Note: Given the nuanced nature of some of the arguments made in the lecture, it is highly recommended that you view the lecture recording given by Professor Ari Edmundson to fully engage and understand the material. The course notes will have the same broader structure but are by no means comprehensive.

> Disclaimer: The following note discusses issues of structural racism. Some of the items in this note may be sensitive and may or may not be the opinions, ideas, and beliefs of the students who collected the materials. The Data 100 course staff tries its best to only present information that is relevant for teaching the lessons at hand.
As data scientists, our goal is to wrangle data, recognize patterns and use them to make predictions within a certain context. However, it is often easy to abstract data away from its original context. In previous lectures, we’ve explored datasets like `elections`, `babynames`, and `world_bank` to learn fundamental techniques for working with data, but rarely do we stop to ask questions like “How/when was this data collected?” or “Are there any inherent biases in the data that could affect results?”. It turns out that inquiries like these profoundly affect how data scientists approach a task and convey their findings. This lecture explores these ethical dilemmas through the lens of a case study.
Let’s immerse ourselves in the real-world story of data scientists working for an organization called the Cook County Assessor’s Office (CCAO) located in Chicago, Illinois. Their job is to estimate the values of houses in order to assign property taxes. This is because the tax burden in this area is determined by the estimated value of a house rather than its price. Since value changes over time and has no obvious indicators, the CCAO created a model to estimate the values of houses. In this note, we will dig deep into biases that arose in the model, the consequences to human lives, and what we can learn from this example to avoid the same mistakes in the future.
+What prompted the formation of the CCAO and led to the development of this model? In 2017, an investigative report by the Chicago Tribune uncovered a major scandal in the property assessment system managed by the CCAO under the watch of former County Assessor Joseph Berrios. Working with experts from the University of Chicago, the Chicago Tribune journalists found that the CCAO’s model for estimating house value perpetuated a highly regressive tax system that disproportionately burdened African-American and Latinx homeowners in Cook County. How did the journalists demonstrate this disparity?
+The image above shows two standard metrics to estimate the fairness of assessments: the coefficient of dispersion and price-related differential. How they’re calculated is out of scope for this class, but you can assume that these metrics have been rigorously tested by experts in the field and are a good indication of fairness. As we see above, calculating these metrics for the Cook County prices revealed that the pricing created by the CCAO did not fall in acceptable ranges. While this on its own is not the entire story, it was a good indicator that something fishy was going on.
+This prompted journalists to investigate if the CCAO’s model itself was producing fair tax rates. When accounting for the homeowner’s income, they found that the model actually produced a regressive tax rate (see figure above). A tax rate is regressive if the percentage tax rate is higher for individuals with lower net income; it is progressive if the percentage tax rate is higher for individuals with higher net income.
+Digging further, journalists found that the model was not only regressive and unfair to lower-income individuals, but it was also unfair to non-white homeowners (see figure above). The likelihood of a property being under- or over-assessed was highly dependent on the owner’s race, and that did not sit well with many homeowners.
+What was the cause of such a major issue? It might be easy to simply blame “biased” algorithms, but the main issue was not a faulty model. Instead, it was largely due to the appeals system which enabled the wealthy and privileged to more easily and successfully challenge their assessments. Once given the CCAO model’s initial assessment of their home’s value, homeowners could choose to appeal to a board of elected officials to try and change the listed value of their home and, consequently, how much they are taxed. In theory, this sounds like a very fair system: a human being oversees the final pricing of houses rather than a computer algorithm. In reality, this ended up exacerbating the problem.
> “Appeals are a good thing,” Thomas Jaconetty, deputy assessor for valuation and appeals, said in an interview. “The goal here is fairness. We made the numbers. We can change them.”
We can borrow lessons from Critical Race Theory —— on the surface, everyone has the legal right to try and appeal the value of their home. However, not everyone has an equal ability to do so. Those who have the money to hire tax lawyers to appeal for them have a drastically higher chance of trying and succeeding in their appeal (see above figure). Many homeowners who appealed were generally under-assessed compared to homeowners who did not (see figure below). Clearly, the model is part of a deeper institutional pattern rife with potential corruption.
+In fact, Chicago boasts a large and thriving tax attorney industry dedicated precisely to appealing property assessments, reflected in the growing number of appeals in Cook County in the 21st century. Given wealthier, whiter neighborhoods typically have greater access to lawyers, they often appealed more and won reductions far more often than their less wealthy neighbors. In other words, those with higher incomes pay less in property tax, tax lawyers can grow their business due to their role in appeals, and politicians are socially connected to the aforementioned tax lawyers and wealthy homeowners. All these stakeholders have reasons to advertise the appeals system as an integral part of a fair system; after all, it serves to benefit them. Here lies the value in asking questions: a system that seems fair on the surface may, in reality, be unfair upon taking a closer look.
+What happened as a result of this corrupt system? As the Chicago Tribune reported, many African American and Latino homeowners purchased homes only to find their houses were later appraised at levels far higher than what they paid. As a result, homeowners were now responsible for paying significantly more in taxes every year than initially budgeted, putting them at risk of not being able to afford their homes and losing them.
+The impact of the housing model extends beyond the realm of home ownership and taxation —— the issues of justice go much deeper. This model perpetrated much older patterns of racially discriminatory practices in Chicago and across the United States. Unfortunately, it is no accident that this happened in Chicago, one of the most segregated cities in the United States (source). These factors are central to informing us, as data scientists, about what is at stake.
+Before we dive into how the CCAO used data science to “solve” this problem, let’s briefly go through the history of discriminatory housing practices in the United States to give more context on the gravity and urgency of this situation.
+Housing and real estate, among other factors, have been one of the most significant and enduring drivers of structural racism and racial inequality in the United States since the Civil War. It is one of the main areas where inequalities are created and reproduced. In the early 20th century, Jim Crow laws were explicit in forbidding people of color from utilizing the same facilities —— such as buses, bathrooms, and pools —— as white individuals. This set of practices by government actors in combination with overlapping practices driven by the private real estate industry further served to make neighborhoods increasingly segregated.
+Although advancements in civil rights have been made, the spirit of the laws is alive in many parts of the US. In the 1920s and 1930s, it was illegal for governments to actively segregate neighborhoods according to race, but other methods were available for achieving the same ends. One of the most notorious practices was redlining: the federal housing agencies’ process of distinguishing neighborhoods in a city in terms of relative risk. The goal was to increase access to homeownership for low-income Americans. In practice, however, it allowed real estate professionals to legally perpetuate segregation. The federal housing agencies deemed predominantly African American neighborhoods as high risk and colored them in red —— hence the name redlining —— making it nearly impossible for African Americans to own a home.
+The origins of the data that made these maps possible lay in a kind of “racial data revolution” in the private real estate industry beginning in the 1920s. Segregation was established and reinforced in part through the work of real estate agents who were also very concerned with establishing reliable methods for predicting the value of a home. The effects of these practices continue to resonate today.
+The response to this problem started in politics. A new assessor, Fritz Kaegi, was elected and created a new mandate with two goals:
+He wanted to not only create a more accurate algorithmic model but also to design a new system to address the problems with the CCAO.
+Let’s frame this problem through the lens of the data science lifecycle.
+The old system was unfair because it was systemically inaccurate; it made one kind of error for one group, and another kind of error for another. Its goal was to “create a robust pipeline that accurately assesses property values at scale and is fair”, and in turn, they defined fairness as accuracy: “the ability of our pipeline to accurately assess all residential property values, accounting for disparities in geography, information, etc.” Thus, the plan —— make the system more fair —— was already framed in terms of a task appropriate to a data scientist: make the assessments more accurate (or more precisely, minimize errors in a particular way).
+The idea here is that if the model is more accurate it will also (perhaps necessarily) become more fair, which is a big assumption. There are, in a sense, two different problems —— make accurate assessments, and make a fair system. Treating these two problems as one makes it a more straightforward issue that can be solved technically (with a good model) but does raise the question of if fairness and accuracy are one and the same.
+For now, let’s just talk about the technical part of this —— accuracy. For you, the data scientist, this part might feel more comfortable. We can determine some metrics of success and frame a social problem as a data science problem.
+ +The new Office of Data Science started by framing the problem and redefining their goals. They determined that they needed to:
+The goals defined above lead us to ask the question: what does it actually mean to accurately assess property values, and what role does “scale” play?
+Each of the above questions leads to a slew of more questions. Considering just the first question, one answer could be that an assessment is an estimate of the value of a home. This leads to more inquiries: what is the value of a home? What determines it? How do we know? For this class, we take it to be the house’s market value, or how much it would sell for.
+Unfortunately, if you are the county assessor, it becomes hard to determine property values with this definition. After all, you can’t make everyone sell their house every year. And as many properties haven’t been sold in decades, every year that passes makes that previous sale less reliable as an indicator.
+So how would one generate reliable estimates? You’re probably thinking, well, with data about homes and their sale prices you can probably predict the value of a property reliably. Even if you’re not a data scientist, you might know there are websites like Zillow and RedFin that estimate what properties would sell for and constantly update them. They don’t know the value, but they estimate them. How do you think they do this? Let’s start with the data —— which is the next step in the lifecycle.
+To generate estimates, the data scientists used two datasets. The first contained all recorded sales data from 2013 to 2019. The second contained property characteristics, including a property identification number and physical characteristics (e.g., age, bedroom, baths, square feet, neighborhood, site desirability, etc.).
+As they examined the datasets, they asked the questions:
+With so much data available, data scientists worked to see how all the different data points correlated with each other and with the sales prices. By discovering patterns in datasets containing known sale prices and characteristics of similar and nearby properties, training a model on this data, and applying it to all the properties without sales data, it was now possible to create a linear model that could predict the sale price (“fair market value”) of unsold properties.
+Some other key questions data scientists asked about the data were:
+Attributes can have different likelihoods of appearing in the data. For example, housing data in the floodplain geographic region of Chicago were less represented than other regions.
+Features can also be reported at different rates. Improvements in homes, which tend to increase property value, were unlikely to be reported by the homeowners.
+Additionally, they found that there was simply more missing data in lower-income neighborhoods.
+Before the modeling step, they investigated a multitude of crucial questions:
+They found that certain features, such as bedroom number, were much more useful in determining house value for certain neighborhoods than for others. This informed them that different models should be used depending on the neighborhood.
+They also noticed that low-income neighborhoods had disproportionately spottier data. This informed them that they needed to develop new data collection practices - including finding new sources of data.
+Rather than using a singular model to predict sale prices (“fair market value”) of unsold properties, the CCAO predicts sale prices using machine learning models that discover patterns in data sets containing known sale prices and characteristics of similar and nearby properties. It uses different model weights for each neighborhood.
+Compared to traditional mass appraisal, the CCAO’s new approach is more granular and more sensitive to neighborhood variations.
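To make the idea of neighborhood-specific models concrete, here is a heavily simplified, hypothetical sketch (this is not the CCAO’s actual pipeline, and the column names, features, and use of `sklearn.linear_model.LinearRegression` are illustrative assumptions): fit a separate linear model to the sold properties in each neighborhood, then use that neighborhood’s model to estimate the values of its unsold properties.

```python
import pandas as pd
from sklearn.linear_model import LinearRegression

def fit_neighborhood_models(sales: pd.DataFrame, features: list):
    """Fit one linear model per neighborhood using properties with known sale prices."""
    models = {}
    for neighborhood, group in sales.groupby("neighborhood"):
        # Each neighborhood gets its own coefficients ("weights")
        models[neighborhood] = LinearRegression().fit(group[features], group["sale_price"])
    return models

# Hypothetical usage (placeholder column names, not the CCAO's actual schema):
# models = fit_neighborhood_models(sales_df, ["age", "bedrooms", "baths", "sqft"])
# estimate = models["Some Neighborhood"].predict(unsold_df[["age", "bedrooms", "baths", "sqft"]])
```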
+But how do we know if an assessment is accurate? We can see how our model performs when predicting the sales prices of properties it wasn’t trained on! We can then evaluate how “close” our estimate was to the actual sales price, using Root Mean Square Error (RMSE). However, is RMSE a good proxy for fairness in this context?
+Broad metrics of error like RMSE can be limiting when evaluating the “fairness” of a property appraisal system. RMSE does not tell us anything about the distribution of errors, whether the errors are positive or negative, and the relative size of the errors. It does not tell us anything about the regressivity of the model, instead just giving a rough measure of our model’s overall error.
+Even with a low RMSE, we can’t guarantee a fair model. The error we see (no matter how small) may be a result of our model overvaluing less expensive homes and undervaluing more expensive homes.
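A tiny, made-up numerical illustration of this point: the two sets of assessments below have identical RMSE, yet one of them systematically overvalues the cheaper homes and undervalues the more expensive ones, exactly the regressive pattern described above.

```python
import numpy as np

true_values = np.array([100_000, 200_000, 300_000, 400_000])

balanced   = true_values + np.array([20_000, -20_000, 20_000, -20_000])
regressive = true_values + np.array([20_000, 20_000, -20_000, -20_000])  # overvalues cheap, undervalues expensive

rmse = lambda pred: np.sqrt(np.mean((pred - true_values) ** 2))
print(rmse(balanced), rmse(regressive))  # identical RMSE, very different fairness implications
```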
+Regarding accuracy, it’s important to ask what makes a batch of assessments better or more accurate than another batch of assessments. The value of a home that a model predicts is relational. It’s a product of the interaction of social and technical elements so property assessment involves social trust.
+Why should any particular individual believe that the model is accurate for their property? Why should any individual trust the model?
+To foster public trust, the CCAO focuses on “transparency”, putting data, models, and the pipeline onto GitLab. By doing so, they can better equate the production of “accurate assessments” with “fairness”.
+There’s a lot more to be said here on the relationship between accuracy, fairness, and metrics we tend to use when evaluating our models. Given the nuanced nature of the argument, it is recommended you view the corresponding lecture as the course notes are not as comprehensive for this portion of the lecture.
+Unfortunately, it may be naive to hope that a more accurate and transparent algorithm will translate into more fair outcomes in practice. Even if our model is perfectly optimized according to the standards of fairness we’ve set, there is no guarantee that people will actually pay their expected share of taxes as determined by the model. While it is a good step in the right direction, maintaining a level of social trust is key to ensuring people pay their fair share.
+Despite all their best efforts, the CCAO is still struggling to create fair assessments and engender trust.
Stories like this one show that total taxes for residential properties went up overall (because commercial taxes went down). But looking at the distribution, we can see that the biggest increases occurred in wealthy neighborhoods, and the biggest decreases occurred in poorer, predominantly Black neighborhoods. So maybe there was some success after all?
+However, it’ll ultimately be hard to overcome the propensity of the board of review to reduce the tax burden of the rich, preventing the CCAO from creating a truly fair system. This is in part because there are many cases where the model makes big, frustrating mistakes. In some cases like this one, it is due to spotty data.
- Question/Problem Formulation
- Data Acquisition and Cleaning
- Exploratory Data Analysis & Visualization
- Prediction and Inference
+Last time, we began our journey into unsupervised learning by discussing Principal Component Analysis (PCA).
+In this lecture, we will explore another very popular unsupervised learning concept: clustering. Clustering allows us to “group” similar datapoints together without being given labels of what “class” or where each point explicitly comes from. We will discuss two clustering algorithms: K-Means clustering and hierarchical agglomerative clustering, and we’ll examine the assumptions, strengths, and drawbacks of each one.
+In supervised learning, our goal is to create a function that maps inputs to outputs. Each model is learned from example input/output pairs (training set), validated using input/output pairs, and eventually tested on more input/output pairs. Each pair consists of:
+In regression, our output value is quantitative, and in classification, our output value is categorical.
+In unsupervised learning, our goal is to identify patterns in unlabeled data. In this type of learning, we do not have input/output pairs. Sometimes, we may have labels but choose to ignore them (e.g. PCA on labeled data). Instead, we are more interested in the inherent structure of the data we have rather than trying to simply predict a label using that structure of data. For example, if we are interested in dimensionality reduction, we can use PCA to reduce our data to a lower dimension.
+Now, let’s consider a new problem: clustering.
+Consider this figure from Fall 2019 Midterm 2. The original dataset had 8 dimensions, but we have used PCA to reduce our data down to 2 dimensions.
+Each point represents the 1st and 2nd principal component of how much time patrons spent at 8 different zoo exhibits. Visually and intuitively, we could potentially guess that this data belongs to 3 groups: one for each cluster. The goal of clustering is now to assign each point (in the 2 dimensional PCA representation) to a cluster.
+This is an unsupervised task, as:
+Now suppose you’re Netflix and are looking at information on customer viewing habits. Clustering can come in handy here. We can assign each person or show to a “cluster.” (Note: while we don’t know for sure that Netflix actually uses ML clustering to identify these categories, they could, in principle.)
+Keep in mind that with clustering, we don’t need to define clusters in advance; it discovers groups automatically. On the other hand, with classification, we have to decide labels in advance. This marks one of the key differences between clustering and classification.
+Let’s say we’re working with student-generated materials and pass them into the S-BERT module to extract sentence embeddings. Features from clusters are extracted to:
+Here we can see the outline of the anomaly detection module. It consists of:
+Looking more closely at our clustering, we can better understand the different components, which are represented by the centers. Below we have two examples.
+Note that the details for this example are not in scope.
+Now, consider the plot below:
+The rows of this plot are conditions (e.g., a row might be: “poured acid on the cells”), and the columns are genes. The green coloration indicates that the gene was “off” whereas red indicates the gene was “on”. For example, the ~9 genes in the top left corner of the plot were all turned off by the 6 experiments (rows) at the top.
+In a clustering lens, we might be interested in clustering similar observations together based on the reactions (on/off) to certain experiments.
+For example, here is a look at our data before and after clustering.
+Note: apologies if you can’t differentiate red from green by eye! Historical visualizations are not always the best.
+There are many types of clustering algorithms, and they all have strengths, inherent weaknesses, and different use cases. We will first focus on a partitional approach: K-Means clustering.
+The most popular clustering approach is K-Means. The algorithm itself entails the following:
+Pick an arbitrary \(k\), and randomly place \(k\) “centers”, each a different color.
Repeat until convergence:

- Color each data point according to its closest center.
- Move each center to the center (average position) of the data points sharing its color.
+Consider the following data with an arbitrary \(k = 2\) and randomly placed “centers” denoted by the different colors (blue, orange):
+Now, we will follow the rest of the algorithm. First, let us color each point according to the closest center:
+Next, we will move the center for each color to the center of points with that color. Notice how the centers are generally well-centered amongst the data that shares its color.
+Assume this process (re-color and re-set centers) repeats for a few more iterations. We eventually reach this state.
+After this iteration, the center stays still and does not move at all. Thus, we have converged, and the clustering is complete!
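To make the procedure concrete, below is a minimal NumPy sketch of the K-Means loop described above. It is written for illustration only: the data array `X`, the choice of `k`, and the stopping rule are assumptions, and in practice you would typically reach for a library implementation such as `sklearn.cluster.KMeans`.

```python
import numpy as np

def kmeans(X, k, n_iters=100, seed=25):
    """Bare-bones K-Means sketch: X is an (n, d) array of points, k is the number of clusters."""
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]  # random initial "centers"
    labels = np.zeros(len(X), dtype=int)
    for _ in range(n_iters):
        # Step 1: "color" each point by its closest center
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)  # shape (n, k)
        labels = dists.argmin(axis=1)
        # Step 2: move each center to the mean of the points assigned to it
        # (assumes no cluster ever ends up empty)
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centers, centers):  # converged: centers stopped moving
            break
        centers = new_centers
    return centers, labels
```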
+K-Means is a completely different algorithm than K-Nearest Neighbors. K-means is used for clustering, where each point is assigned to one of \(K\) clusters. On the other hand, K-Nearest Neighbors is used for classification (or, less often, regression), and the predicted value is typically the most common class among the \(K\)-nearest data points in the training set. The names may be similar, but there isn’t really anything in common.
+Consider the following example where \(K = 4\):
+Due to the randomness of where the \(K\) centers initialize/start, you will get a different output/clustering every time you run K-Means. Consider three possible K-Means outputs; the algorithm has converged, and the colors denote the final cluster they are clustered as.
+
Which clustering output is the best? To evaluate different clustering results, we need a loss function.
The two common loss functions are:
+In the example above:
+Switching back to the four-cluster example at the beginning of this section, random.seed(25)
had an inertia of 44.96
, random.seed(29)
had an inertia of 45.95
, and random.seed(40)
had an inertia of 54.35
. It seems that the best clustering output was random.seed(25)
with an inertia of 44.96
!
It turns out that the function K-Means is trying to minimize is inertia, but often fails to find global optimum. Why does this happen? We can think of K-means as a pair of optimizers that take turns. The first optimizer holds center positions constant and optimizes data colors. The second optimizer holds data colors constant and optimizes center positions. Neither optimizer gets full control!
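As a quick illustration of this sensitivity to initialization, one might compare the inertia reached from several different random starts. This is a sketch using scikit-learn and an assumed feature array `X`; the exact inertia values depend on the data and seeds, so the numbers quoted above are not reproduced here.

```python
from sklearn.cluster import KMeans

# n_init=1 forces a single initialization per run, so the effect of the seed is visible.
for seed in [25, 29, 40]:
    km = KMeans(n_clusters=4, n_init=1, random_state=seed).fit(X)
    print(f"seed {seed}: inertia = {km.inertia_:.2f}")
```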
+This is a hard problem: give an algorithm that optimizes inertia FOR A GIVEN \(K\); \(K\) is picked in advance. Your algorithm should return the EXACT best centers and colors, but you don’t need to worry about runtime.
+Note: This is a bit of a CS61B/CS70/CS170 problem, so do not worry about completely understanding the tricky predicament we are in too much!
A potential algorithm: for every possible assignment of the \(n\) data points to \(K\) clusters, compute the resulting cluster centers and the corresponding inertia; keep track of the assignment and centers that achieve the lowest inertia seen so far, and return them once all assignments have been tried.
+No better algorithm has been found for solving the problem of minimizing inertia exactly.
+Now, let us consider hierarchical agglomerative clustering.
+
Consider the following results of two K-Means clustering outputs:
Which clustering result do you like better? It seems K-Means likes the one on the right better because it has lower inertia (the sum of squared distances from each data point to its center), but this raises some questions:
Now, let us introduce Hierarchical Agglomerative Clustering! We start with every data point in a separate cluster, and we’ll keep merging the most similar pairs of data points/clusters until we have one big cluster left. This is called a bottom-up or agglomerative method.
There are various ways to decide the order of combining clusters, called the linkage criterion:

- Single linkage: the “distance” between two clusters is the minimum distance between any point in the first cluster and any point in the second.
- Complete linkage: the “distance” is the maximum distance between any point in the first cluster and any point in the second.
- Average linkage: the “distance” is the average distance between points in the first cluster and points in the second.
+The linkage criterion decides how we measure the “distance” between two clusters. Regardless of the criterion we choose, the aim is to combine the two clusters that have the minimum “distance” between them, with the distance computed as per that criterion. In the case of complete linkage, for example, that means picking the two clusters that minimize the maximum distance between a point in the first cluster and a point in the second.
+When the algorithm starts, every data point is in its own cluster. In the plot below, there are 12 data points, so the algorithm starts with 12 clusters. As the clustering begins, it assesses which clusters are the closest together.
+The closest clusters are 10 and 11, so they are merged together.
+Next, points 0 and 4 are merged together because they are closest.
+At this point, we have 10 clusters: 8 with a single point (clusters 1, 2, 3, 4, 5, 6, 7, 8, and 9) and 2 with 2 points (clusters 0 and 10).
+Although clusters 0 and 3 are not the closest, let us consider if we were trying to merge them. A tricky question arises: what is the “distance” between clusters 0 and 3? We can use the Complete-Link approach that uses the max distance among all pairs of points between groups to decide which group has smaller “distance”.
+Let us assume the algorithm runs a little longer, and we have reached the following state. Clusters 0 and 7 are up next, but why? The max line between any member of 0 and 6 is longer than the max line between any member of 0 and 7.
+Thus, 0 and 7 are merged into 0 as they are closer under the complete linkage criterion.
+After more iterations, we finally converge to the plot on the left. There are two clusters (0, 1), and the agglomerative algorithm has converged.
+
Notice that on the full dataset, our agglomerative clustering algorithm achieves the more “correct” output.
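For reference, here is a minimal sketch of how one might run agglomerative clustering with complete linkage using scikit-learn. The feature array `X` and the parameter values are assumptions for illustration, not the exact code behind the plots above.

```python
from sklearn.cluster import AgglomerativeClustering

# Bottom-up clustering: start with every point in its own cluster and repeatedly
# merge the two clusters with the smallest complete-linkage "distance".
agg = AgglomerativeClustering(n_clusters=2, linkage="complete")
labels = agg.fit_predict(X)  # final cluster label (0 or 1) for each row of X
```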
+
+
Some professors use agglomerative clustering for grading bins; if there is a big gap between two people, draw a grading threshold there. The idea is that grade clustering should be more like the figure below on the left, not the right.
+The algorithms we’ve discussed require us to pick a \(K\) before we start. But how do we pick \(K\)? Often, the best \(K\) is subjective. For example, consider the state plot below.
+How many clusters are there here? For K-Means, one approach to determine this is to plot inertia versus many different \(K\) values. We’d pick the \(K\) in the elbow, where we get diminishing returns afterward. Note that big, complicated data often lacks an elbow, so this method is not foolproof. Here, we would likely select \(K = 2\).
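A sketch of how such an elbow plot might be produced with scikit-learn, assuming a feature array `X` (the range of \(K\) values tried is arbitrary):

```python
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Fit K-Means for several values of K and record the inertia of each fit.
ks = range(1, 11)
inertias = [KMeans(n_clusters=k, n_init=10).fit(X).inertia_ for k in ks]

plt.plot(ks, inertias, marker="o")
plt.xlabel("K")
plt.ylabel("Inertia")
plt.title("Pick K near the 'elbow' of the curve");
```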
+To evaluate how “well-clustered” a specific data point is, we can use the silhouette score, also termed the silhouette width. A high silhouette score indicates that a point is near the other points in its cluster; a low score means that it’s far from the other points in its cluster.
+For a data point \(X\), score \(S\) is: \[S =\frac{B - A}{\max(A, B)}\] where \(A\) is the average distance to other points in the cluster, and \(B\) is the average distance to points in the closest cluster.
Consider what the highest possible value of \(S\) is and how that value can occur. The highest possible value of \(S\) is 1, which happens if every point in \(X\)’s cluster is right on top of \(X\); the average distance to other points in \(X\)’s cluster is \(0\), so \(A = 0\). Thus, \(S = \frac{B}{\max(0, B)} = \frac{B}{B} = 1\). \(S\) also approaches 1 whenever \(B\) is much greater than \(A\) (we denote this as \(B >> A\)).
+Can \(S\) be negative? The answer is yes. If the average distance to X’s clustermates is larger than the distance to the closest cluster, then this is possible. For example, the “low score” point on the right of the image above has \(S = -0.13\).
+We can plot the silhouette scores for all of our datapoints. The x-axis represents the silhouette coefficient value or silhouette score. The y-axis tells us which cluster label the points belong to, as well as the number of points within a particular cluster. Points with large silhouette widths are deeply embedded in their cluster; the red dotted line shows the average. Below, we plot the silhouette score for our plot with \(K=2\).
+
+
+
Similarly, we can plot the silhouette score for the same dataset but with \(K=3\):
+
+
+
The average silhouette score is lower with 3 clusters, so \(K=2\) is a better choice. This aligns with our visual intuition as well.
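A sketch of how these silhouette summaries might be computed, again assuming a feature array `X`: `silhouette_score` returns the average width (the red dotted line in the plots above), while `silhouette_samples` returns the per-point widths.

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

for k in [2, 3]:
    labels = KMeans(n_clusters=k, n_init=10).fit_predict(X)
    avg_width = silhouette_score(X, labels)   # average silhouette width
    widths = silhouette_samples(X, labels)    # one width per data point
    print(f"K={k}: average width = {avg_width:.3f}, lowest width = {widths.min():.3f}")
```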
Sometimes you can rely on real-world metrics to guide your choice of \(K\). For t-shirts, for example, we could cluster customers’ body measurements into \(K = 3\) sizes (S, M, L) or \(K = 5\) sizes (XS, S, M, L, XL).
+To choose \(K\), consider projected costs and sales for the 2 different \(K\)s and select the one that maximizes profit.
We’ve now discussed a new machine learning goal, clustering, and explored two solutions: K-Means clustering and hierarchical agglomerative clustering.

Our version of these algorithms required a hyperparameter \(K\). There are 3 ways to pick \(K\): the elbow method, silhouette scores, and by harnessing real-world metrics.
+There are many machine learning problems. Each can be addressed by many different solution techniques. Each has many metrics for evaluating success / loss. Many techniques can be used to solve different problem types. For example, linear models can be used for regression and classification.
+We’ve only scratched the surface and haven’t discussed many important ideas, such as neural networks and deep learning. In the last lecture, we’ll provide some specific course recommendations on how to explore these topics further.
Last time, we introduced the modeling process. We set up a framework to predict target variables as functions of our features, following a set workflow:
+To illustrate this process, we derived the optimal model parameters under simple linear regression (SLR) with mean squared error (MSE) as the cost function. A summary of the SLR modeling process is shown below:
In this lecture, we’ll dive deeper into step 4 - evaluating model performance - using SLR as an example. We’ll also continue familiarizing ourselves with the modeling process by finding the best model parameters under a new model, the constant model, and we’ll test out two different loss functions to understand how our choice of loss influences model design. Later on, we’ll consider what happens when a linear model isn’t the best choice to capture trends in our data and what solutions there are to create better models.
+Before we get into Step 4, let’s quickly review some important terminology.
+The terms prediction and estimation are often used somewhat interchangeably, but there is a subtle difference between them. Estimation is the task of using data to calculate model parameters. Prediction is the task of using a model to predict outputs for unseen data. In our simple linear regression model,
\[\hat{y} = \hat{\theta_0} + \hat{\theta_1} x\]
+we estimate the parameters by minimizing average loss; then, we predict using these estimations. Least Squares Estimation is when we choose the parameters that minimize MSE.
+Now that we’ve explored the mathematics behind (1) choosing a model, (2) choosing a loss function, and (3) fitting the model, we’re left with one final question – how “good” are the predictions made by this “best” fitted model? To determine this, we can:
+Visualize data and compute statistics:
+Performance metrics:
+\[\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2}\]
Visualization:
+To illustrate this process, let’s take a look at Anscombe’s quartet.
+Let’s take a look at four different datasets.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import itertools
from mpl_toolkits.mplot3d import Axes3D
# Big font helper
def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size != None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc("font", size=SMALL_SIZE)  # controls default text sizes
    plt.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
    plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
    plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
    plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
    plt.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
    plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title


# Helper functions
def standard_units(x):
    return (x - np.mean(x)) / np.std(x)


def correlation(x, y):
    return np.mean(standard_units(x) * standard_units(y))


def slope(x, y):
    return correlation(x, y) * np.std(y) / np.std(x)


def intercept(x, y):
    return np.mean(y) - slope(x, y) * np.mean(x)


def fit_least_squares(x, y):
    theta_0 = intercept(x, y)
    theta_1 = slope(x, y)
    return theta_0, theta_1


def predict(x, theta_0, theta_1):
    return theta_0 + theta_1 * x


def compute_mse(y, yhat):
    return np.mean((y - yhat) ** 2)


plt.style.use("default")  # Revert style to default mpl
"default") # Revert style to default mpl
+ plt.style.use(= range(3)
+ NO_VIZ, RESID, RESID_SCATTER
+
+def least_squares_evaluation(x, y, visualize=NO_VIZ):
+# statistics
+ print(f"x_mean : {np.mean(x):.2f}, y_mean : {np.mean(y):.2f}")
+ print(f"x_stdev: {np.std(x):.2f}, y_stdev: {np.std(y):.2f}")
+ print(f"r = Correlation(x, y): {correlation(x, y):.3f}")
+
+# Performance metrics
+ = fit_least_squares(x, y)
+ ahat, bhat = predict(x, ahat, bhat)
+ yhat print(f"\theta_0: {ahat:.2f}, \theta_1: {bhat:.2f}")
+ print(f"RMSE: {np.sqrt(compute_mse(y, yhat)):.3f}")
+
+# visualization
+ = None, None
+ fig, ax_resid if visualize == RESID_SCATTER:
+ = plt.subplots(1, 2, figsize=(8, 3))
+ fig, axs 0].scatter(x, y)
+ axs[0].plot(x, yhat)
+ axs[0].set_title("LS fit")
+ axs[= axs[1]
+ ax_resid elif visualize == RESID:
+ = plt.figure(figsize=(4, 3))
+ fig = plt.gca()
+ ax_resid
+if ax_resid is not None:
+ - yhat, color="red")
+ ax_resid.scatter(x, y 4, 14], [0, 0], color="black")
+ ax_resid.plot(["Residuals")
+ ax_resid.set_title(
+return fig
# Load in four different datasets: I, II, III, IV
x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]
y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]
x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]
y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]

anscombe = {
    "I": pd.DataFrame(list(zip(x, y1)), columns=["x", "y"]),
    "II": pd.DataFrame(list(zip(x, y2)), columns=["x", "y"]),
    "III": pd.DataFrame(list(zip(x, y3)), columns=["x", "y"]),
    "IV": pd.DataFrame(list(zip(x4, y4)), columns=["x", "y"]),
}
# Plot the scatter plot and line of best fit
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for i, dataset in enumerate(["I", "II", "III", "IV"]):
    ans = anscombe[dataset]
    x, y = ans["x"], ans["y"]
    ahat, bhat = fit_least_squares(x, y)
    yhat = predict(x, ahat, bhat)
    axs[i // 2, i % 2].scatter(x, y, alpha=0.6, color="red")  # plot the x, y points
    axs[i // 2, i % 2].plot(x, yhat)  # plot the line of best fit
    axs[i // 2, i % 2].set_xlabel(f"$x_{i+1}$")
    axs[i // 2, i % 2].set_ylabel(f"$y_{i+1}$")
    axs[i // 2, i % 2].set_title(f"Dataset {dataset}")

plt.show()
While these four sets of datapoints look very different, they actually all have identical means \(\bar x\), \(\bar y\), standard deviations \(\sigma_x\), \(\sigma_y\), correlation \(r\), and RMSE! If we only look at these statistics, we would probably be inclined to say that these datasets are similar.
for dataset in ["I", "II", "III", "IV"]:
    print(f">>> Dataset {dataset}:")
    ans = anscombe[dataset]
    fig = least_squares_evaluation(ans["x"], ans["y"], visualize=NO_VIZ)
    print()
    print()
>>> Dataset I:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.119


>>> Dataset II:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.119


>>> Dataset III:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.118


>>> Dataset IV:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.817
theta_0: 3.00, theta_1: 0.50
RMSE: 1.118
+
+
+We may also wish to visualize the model’s residuals, defined as the difference between the observed and predicted \(y_i\) value (\(e_i = y_i - \hat{y}_i\)). This gives a high-level view of how “off” each prediction is from the true observed value. Recall that you explored this concept in Data 8: a good regression fit should display no clear pattern in its plot of residuals. The residual plots for Anscombe’s quartet are displayed below. Note how only the first plot shows no clear pattern to the magnitude of residuals. This is an indication that SLR is not the best choice of model for the remaining three sets of points.
+ +# Residual visualization
+= plt.subplots(2, 2, figsize=(10, 10))
+ fig, axs
+for i, dataset in enumerate(["I", "II", "III", "IV"]):
+= anscombe[dataset]
+ ans = ans["x"], ans["y"]
+ x, y = fit_least_squares(x, y)
+ ahat, bhat = predict(x, ahat, bhat)
+ yhat // 2, i % 2].scatter(
+ axs[i - yhat, alpha=0.6, color="red"
+ x, y # plot the x, y points
+ ) // 2, i % 2].plot(
+ axs[i ="black"
+ x, np.zeros_like(x), color# plot the residual line
+ ) // 2, i % 2].set_xlabel(f"$x_{i+1}$")
+ axs[i // 2, i % 2].set_ylabel(f"$e_{i+1}$")
+ axs[i // 2, i % 2].set_title(f"Dataset {dataset} Residuals")
+ axs[i
+ plt.show()
Now, we’ll shift from the SLR model to the constant model, also known as a summary statistic. The constant model is slightly different from the simple linear regression model we’ve explored previously. Rather than generating predictions from an inputted feature variable, the constant model always predicts the same constant number. This ignores any relationships between variables. For example, let’s say we want to predict the number of drinks a boba shop sells in a day. Boba tea sales likely depend on the time of year, the weather, how the customers feel, whether school is in session, etc., but the constant model ignores these factors in favor of a simpler model. In other words, the constant model employs a simplifying assumption.
+It is also a parametric, statistical model:
+\[\hat{y} = \theta_0\]
+\(\theta_0\) is the parameter of the constant model, just as \(\theta_0\) and \(\theta_1\) were the parameters in SLR. Since our parameter \(\theta_0\) is 1-dimensional (\(\theta_0 \in \mathbb{R}\)), we now have no input to our model and will always predict \(\hat{y} = \theta_0\).
+Our task now is to determine what value of \(\theta_0\) best represents the optimal model – in other words, what number should we guess each time to have the lowest possible average loss on our data?
+Like before, we’ll use Mean Squared Error (MSE). Recall that the MSE is average squared loss (L2 loss) over the data \(D = \{y_1, y_2, ..., y_n\}\).
+\[\hat{R}(\theta) = \frac{1}{n}\sum^{n}_{i=1} (y_i - \hat{y_i})^2 \]
+Our modeling process now looks like this:
+Given the constant model \(\hat{y} = \theta_0\), we can rewrite the MSE equation as
+\[\hat{R}(\theta) = \frac{1}{n}\sum^{n}_{i=1} (y_i - \theta_0)^2 \]
+We can fit the model by finding the optimal \(\hat{\theta_0}\) that minimizes the MSE using a calculus approach.
+\[ +\begin{align} +\frac{d}{d\theta_0}\text{R}(\theta) & = \frac{d}{d\theta_0}(\frac{1}{n}\sum^{n}_{i=1} (y_i - \theta_0)^2) +\\ &= \frac{1}{n}\sum^{n}_{i=1} \frac{d}{d\theta_0} (y_i - \theta_0)^2 \quad \quad \text{a derivative of sums is a sum of derivatives} +\\ &= \frac{1}{n}\sum^{n}_{i=1} 2 (y_i - \theta_0) (-1) \quad \quad \text{chain rule} +\\ &= {\frac{-2}{n}}\sum^{n}_{i=1} (y_i - \theta_0) \quad \quad \text{simply constants} +\end{align} +\]
+Set the derivative equation equal to 0:
+\[ +0 = {\frac{-2}{n}}\sum^{n}_{i=1} (y_i - \hat{\theta_0}) +\]
Solve for \(\hat{\theta_0}\)
\[ +\begin{align} +0 &= {\frac{-2}{n}}\sum^{n}_{i=1} (y_i - \hat{\theta_0}) +\\ &= \sum^{n}_{i=1} (y_i - \hat{\theta_0}) \quad \quad \text{divide both sides by} \frac{-2}{n} +\\ &= \left(\sum^{n}_{i=1} y_i\right) - \left(\sum^{n}_{i=1} \theta_0\right) \quad \quad \text{separate sums} +\\ &= \left(\sum^{n}_{i=1} y_i\right) - (n \cdot \hat{\theta_0}) \quad \quad \text{c + c + … + c = nc} +\\ n \cdot \hat{\theta_0} &= \sum^{n}_{i=1} y_i +\\ \hat{\theta_0} &= \frac{1}{n} \sum^{n}_{i=1} y_i +\\ \hat{\theta_0} &= \bar{y} +\end{align} +\]
+Let’s take a moment to interpret this result. \(\hat{\theta_0} = \bar{y}\) is the optimal parameter for constant model + MSE. It holds true regardless of what data sample you have, and it provides some formal reasoning as to why the mean is such a common summary statistic.
+Our optimal model parameter is the value of the parameter that minimizes the cost function. This minimum value of the cost function can be expressed:
+\[R(\hat{\theta_0}) = \min_{\theta_0} R(\theta_0)\]
+To restate the above in plain English: we are looking at the value of the cost function when it takes the best parameter as input. This optimal model parameter, \(\hat{\theta_0}\), is the value of \(\theta_0\) that minimizes the cost \(R\).
+For modeling purposes, we care less about the minimum value of cost, \(R(\hat{\theta_0})\), and more about the value of \(\theta\) that results in this lowest average loss. In other words, we concern ourselves with finding the best parameter value such that:
+\[\hat{\theta} = \underset{\theta}{\operatorname{\arg\min}}\:R(\theta)\]
+That is, we want to find the argument \(\theta\) that minimizes the cost function.
+Now that we’ve explored the constant model with an L2 loss, we can compare it to the SLR model that we learned last lecture. Consider the dataset below, which contains information about the ages and lengths of dugongs. Supposed we wanted to predict dugong ages:
++ | Constant Model | +Simple Linear Regression | +
---|---|---|
model | +\(\hat{y} = \theta_0\) | +\(\hat{y} = \theta_0 + \theta_1 x\) | +
data | +sample of ages \(D = \{y_1, y_2, ..., y_n\}\) | +sample of ages \(D = \{(x_1, y_1), (x_2, y_2), ..., (x_n, y_n)\}\) | +
dimensions | +\(\hat{\theta_0}\) is 1-D | +\(\hat{\theta} = [\hat{\theta_0}, \hat{\theta_1}]\) is 2-D | +
loss surface | +2-D ![]() |
+3-D ![]() |
+
loss model | +\(\hat{R}(\theta) = \frac{1}{n}\sum^{n}_{i=1} (y_i - \theta_0)^2\) | +\(\hat{R}(\theta_0, \theta_1) = \frac{1}{n}\sum^{n}_{i=1} (y_i - (\theta_0 + \theta_1 x))^2\) | +
RMSE | +7.72 | +4.31 | +
predictions visualized | +rug plot ![]() |
+scatter plot ![]() |
+
(Notice how the points for our SLR scatter plot are visually not a great linear fit. We’ll come back to this).
+The code for generating the graphs and models is included below, but we won’t go over it in too much depth.
+= pd.read_csv("data/dugongs.csv")
+ dugongs = dugongs["Age"]
+ data_constant = dugongs[["Length", "Age"]] data_linear
# Constant Model + MSE
+'default') # Revert style to default mpl
+ plt.style.use(=16)
+ adjust_fontsize(size%matplotlib inline
+
+def mse_constant(theta, data):
+return np.mean(np.array([(y_obs - theta) ** 2 for y_obs in data]), axis=0)
+
+= np.linspace(-20, 42, 1000)
+ thetas = mse_constant(thetas, data_constant)
+ l2_loss_thetas
+# Plotting the loss surface
+
+ plt.plot(thetas, l2_loss_thetas)r'$\theta_0$')
+ plt.xlabel(r'MSE')
+ plt.ylabel(
+# Optimal point
+= np.mean(data_constant)
+ thetahat =50, label = r"$\hat{\theta}_0$")
+ plt.scatter([thetahat], [mse_constant(thetahat, data_constant)], s;
+ plt.legend()# plt.show()
# SLR + MSE
+def mse_linear(theta_0, theta_1, data_linear):
+= data_linear.iloc[:, 0], data_linear.iloc[:, 1]
+ data_x, data_y return np.mean(
+ - (theta_0 + theta_1 * x)) ** 2 for x, y in zip(data_x, data_y)]),
+ np.array([(y =0,
+ axis
+ )
+
+# plotting the loss surface
+= np.linspace(-80, 20, 80)
+ theta_0_values = np.linspace(-10, 30, 80)
+ theta_1_values = np.array(
+ mse_values for x in theta_0_values] for y in theta_1_values]
+ [[mse_linear(x, y, data_linear)
+ )
+# Optimal point
+= data_linear.iloc[:, 0], data_linear.iloc[:, 1]
+ data_x, data_y = np.corrcoef(data_x, data_y)[0, 1] * np.std(data_y) / np.std(data_x)
+ theta_1_hat = np.mean(data_y) - theta_1_hat * np.mean(data_x)
+ theta_0_hat
+# Create the 3D plot
+= plt.figure(figsize=(7, 5))
+ fig = fig.add_subplot(111, projection="3d")
+ ax
+= np.meshgrid(theta_0_values, theta_1_values)
+ X, Y = ax.plot_surface(
+ surf ="viridis", alpha=0.6
+ X, Y, mse_values, cmap# Use alpha to make it slightly transparent
+ )
+# Scatter point using matplotlib
+= ax.scatter(
+ sc
+ [theta_0_hat],
+ [theta_1_hat],
+ [mse_linear(theta_0_hat, theta_1_hat, data_linear)],="o",
+ marker="red",
+ color=100,
+ s="theta hat",
+ label
+ )
+# Create a colorbar
+= fig.colorbar(surf, ax=ax, shrink=0.5, aspect=10)
+ cbar "Cost Value")
+ cbar.set_label(
+"MSE for different $\\theta_0, \\theta_1$")
+ ax.set_title("$\\theta_0$")
+ ax.set_xlabel("$\\theta_1$")
+ ax.set_ylabel("MSE")
+ ax.set_zlabel(
+# plt.show()
Text(0.5, 0, 'MSE')
+# Predictions
+= data_linear["Age"] # The true observations y
+ yobs = data_linear["Length"] # Needed for linear predictions
+ xs = len(yobs) # Predictions
+ n
+= [thetahat for i in range(n)] # Not used, but food for thought
+ yhats_constant = [theta_0_hat + theta_1_hat * x for x in xs] yhats_linear
# Constant Model Rug Plot
+# In case we're in a weird style state
+
+ sns.set_theme()=16)
+ adjust_fontsize(size%matplotlib inline
+
+= plt.figure(figsize=(8, 1.5))
+ fig =0.25, lw=2) ;
+ sns.rugplot(yobs, height='red', lw=4, label=r"$\hat{\theta}_0$");
+ plt.axvline(thetahat, color
+ plt.legend();
+ plt.yticks([])# plt.show()
# SLR model scatter plot
+# In case we're in a weird style state
+
+ sns.set_theme()=16)
+ adjust_fontsize(size%matplotlib inline
+
+=xs, y=yobs)
+ sns.scatterplot(x='red', lw=4);
+ plt.plot(xs, yhats_linear, color# plt.savefig('dugong_line.png', bbox_inches = 'tight');
+# plt.show()
Interpreting the RMSE (Root Mean Squared Error):
+We see now that changing the model used for prediction leads to a wildly different result for the optimal model parameter. What happens if we instead change the loss function used in model evaluation?
+This time, we will consider the constant model with L1 (absolute loss) as the loss function. This means that the average loss will be expressed as the Mean Absolute Error (MAE).
+Recall that the MAE is average absolute loss (L1 loss) over the data \(D = \{y_1, y_2, ..., y_n\}\).
+\[\hat{R}(\theta_0) = \frac{1}{n}\sum^{n}_{i=1} |y_i - \hat{y_i}| \]
+Given the constant model \(\hat{y} = \theta_0\), we can write the MAE as:
+\[\hat{R}(\theta_0) = \frac{1}{n}\sum^{n}_{i=1} |y_i - \theta_0| \]
+To fit the model, we find the optimal parameter value \(\hat{\theta_0}\) that minimizes the MAE by differentiating using a calculus approach:
+\[ +\begin{align} +\hat{R}(\theta_0) &= \frac{1}{n}\sum^{n}_{i=1} |y_i - \theta_0| \\ +\frac{d}{d\theta_0} R(\theta_0) &= \frac{d}{d\theta_0} \left(\frac{1}{n} \sum^{n}_{i=1} |y_i - \theta_0| \right) \\ +&= \frac{1}{n} \sum^{n}_{i=1} \frac{d}{d\theta_0} |y_i - \theta_0| +\end{align} +\]
+\[|y_i - \theta_0| = \begin{cases} y_i - \theta_0 \quad \text{ if } \theta_0 \le y_i \\ \theta_0 - y_i \quad \text{if }\theta_0 > y_i \end{cases}\]
+\[\frac{d}{d\theta_0} |y_i - \theta_0| = \begin{cases} \frac{d}{d\theta_0} (y_i - \theta_0) = -1 \quad \text{if }\theta_0 < y_i \\ \frac{d}{d\theta_0} (\theta_0 - y_i) = 1 \quad \text{if }\theta_0 > y_i \end{cases}\]
+\[ +\frac{d}{d\theta_0} R(\theta_0) = \frac{1}{n} \sum^{n}_{i=1} \frac{d}{d\theta_0} |y_i - \theta_0| \\ += \frac{1}{n} \left[\sum_{\theta_0 < y_i} (-1) + \sum_{\theta_0 > y_i} (+1) \right] +\]
+Set the derivative equation equal to 0: \[ 0 = \frac{1}{n}\sum_{\hat{\theta_0} < y_i} (-1) + \frac{1}{n}\sum_{\hat{\theta_0} > y_i} (+1) \]
Solve for \(\hat{\theta_0}\): \[ 0 = -\frac{1}{n}\sum_{\hat{\theta_0} < y_i} (1) + \frac{1}{n}\sum_{\hat{\theta_0} > y_i} (1)\]
\[\sum_{\hat{\theta_0} < y_i} (1) = \sum_{\hat{\theta_0} > y_i} (1) \]
+Thus, the constant model parameter \(\theta = \hat{\theta_0}\) that minimizes MAE must satisfy:
+\[ \sum_{\hat{\theta_0} < y_i} (1) = \sum_{\hat{\theta_0} > y_i} (1) \]
+In other words, the number of observations greater than \(\theta_0\) must be equal to the number of observations less than \(\theta_0\); there must be an equal number of points on the left and right sides of the equation. This is the definition of median, so our optimal value is \[ \hat{\theta_0} = median(y) \]
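A quick numerical check of this result, using the small drink-sales sample that is revisited below when comparing MSE and MAE:

```python
import numpy as np

y = np.array([20, 21, 22, 29, 33])  # drink sales sample used later in this note
mae = lambda theta0: np.mean(np.abs(y - theta0))

theta_hat = np.median(y)
print(mae(theta_hat))                          # MAE at the median (theta_0 = 22)
print(mae(theta_hat - 1), mae(theta_hat + 1))  # both larger
```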
+First, define the objective function as average loss.
+Then, find the minimum of the objective function:
+Recall critical points from calculus: \(R(\hat{\theta})\) could be a minimum, maximum, or saddle point!
+We’ve now tried our hand at fitting a model under both MSE and MAE cost functions. How do the two results compare?
+Let’s consider a dataset where each entry represents the number of drinks sold at a bubble tea store each day. We’ll fit a constant model to predict the number of drinks that will be sold tomorrow.
+= np.array([20, 21, 22, 29, 33])
+ drinks drinks
array([20, 21, 22, 29, 33])
+From our derivations above, we know that the optimal model parameter under MSE cost is the mean of the dataset. Under MAE cost, the optimal parameter is the median of the dataset.
+ np.mean(drinks), np.median(drinks)
(np.float64(25.0), np.float64(22.0))
+If we plot each empirical risk function across several possible values of \(\theta\), we find that each \(\hat{\theta}\) does indeed correspond to the lowest value of error:
+Notice that the MSE above is a smooth function – it is differentiable at all points, making it easy to minimize using numerical methods. The MAE, in contrast, is not differentiable at each of its “kinks.” We’ll explore how the smoothness of the cost function can impact our ability to apply numerical optimization in a few weeks.
+How do outliers affect each cost function? Imagine we replace the largest value in the dataset with 1000. The mean of the data increases substantially, while the median is nearly unaffected.
+= np.append(drinks, 1033)
+ drinks_with_outlier
+ display(drinks_with_outlier) np.mean(drinks_with_outlier), np.median(drinks_with_outlier)
array([ 20, 21, 22, 29, 33, 1033])
+(np.float64(193.0), np.float64(25.5))
+This means that under the MSE, the optimal model parameter \(\hat{\theta}\) is strongly affected by the presence of outliers. Under the MAE, the optimal parameter is not as influenced by outlying data. We can generalize this by saying that the MSE is sensitive to outliers, while the MAE is robust to outliers.
+Let’s try another experiment. This time, we’ll add an additional, non-outlying datapoint to the data.
+= np.append(drinks, 35)
+ drinks_with_additional_observation drinks_with_additional_observation
array([20, 21, 22, 29, 33, 35])
+When we again visualize the cost functions, we find that the MAE now plots a horizontal line between 22 and 29. This means that there are infinitely many optimal values for the model parameter: any value \(\hat{\theta} \in [22, 29]\) will minimize the MAE. In contrast, the MSE still has a single best value for \(\hat{\theta}\). In other words, the MSE has a unique solution for \(\hat{\theta}\); the MAE is not guaranteed to have a single unique solution.
+
To summarize our example,
++ | MSE (Mean Squared Loss) | +MAE (Mean Absolute Loss) | +
---|---|---|
Loss Function | +\(\hat{R}(\theta) = \frac{1}{n}\sum^{n}_{i=1} (y_i - \theta_0)^2\) | +\(\hat{R}(\theta) = \frac{1}{n}\sum^{n}_{i=1} |y_i - \theta_0|\) | +
Optimal \(\hat{\theta_0}\) | +\(\hat{\theta_0} = mean(y) = \bar{y}\) | +\(\hat{\theta_0} = median(y)\) | +
Loss Surface | +![]() |
+![]() |
+
Shape | +Smooth - easy to minimize using numerical methods (in a few weeks) | +Piecewise - at each of the “kinks,” it’s not differentiable. Harder to minimize. | +
Outliers | +Sensitive to outliers (since they change mean substantially). Sensitivity also depends on the dataset size. | +More robust to outliers. | +
\(\hat{\theta_0}\) Uniqueness | +Unique \(\hat{\theta_0}\) | +Infinitely many \(\hat{\theta_0}\)s | +
At this point, we have an effective method of fitting models to predict linear relationships. Given a feature variable and target, we can apply our four-step process to find the optimal model parameters.
+A key word above is linear. When we computed parameter estimates earlier, we assumed that \(x_i\) and \(y_i\) shared a roughly linear relationship. Data in the real world isn’t always so straightforward, but we can transform the data to try and obtain linearity.
+The Tukey-Mosteller Bulge Diagram is a useful tool for summarizing what transformations can linearize the relationship between two variables. To determine what transformations might be appropriate, trace the shape of the “bulge” made by your data. Find the quadrant of the diagram that matches this bulge. The transformations shown on the vertical and horizontal axes of this quadrant can help improve the fit between the variables.
+Note that:
+Other goals in addition to linearity are possible, for example, making data appear more symmetric. Linearity allows us to fit lines to the transformed data.
+Let’s revisit our dugongs example. The lengths and ages are plotted below:
# `corrcoef` computes the correlation coefficient between two variables
# `std` finds the standard deviation
x = dugongs["Length"]
y = dugongs["Age"]
r = np.corrcoef(x, y)[0, 1]
theta_1 = r * np.std(y) / np.std(x)
theta_0 = np.mean(y) - theta_1 * np.mean(x)

fig, ax = plt.subplots(1, 2, dpi=200, figsize=(8, 3))
ax[0].scatter(x, y)
ax[0].set_xlabel("Length")
ax[0].set_ylabel("Age")

ax[1].scatter(x, y)
ax[1].plot(x, theta_0 + theta_1 * x, "tab:red")
ax[1].set_xlabel("Length")
ax[1].set_ylabel("Age")
Text(0, 0.5, 'Age')
+Looking at the plot on the left, we see that there is a slight curvature to the data points. Plotting the SLR curve on the right results in a poor fit.
+For SLR to perform well, we’d like there to be a rough linear trend relating "Age"
and "Length"
. What is making the raw data deviate from a linear relationship? Notice that the data points with "Length"
greater than 2.6 have disproportionately high values of "Age"
relative to the rest of the data. If we could manipulate these data points to have lower "Age"
values, we’d “shift” these points downwards and reduce the curvature in the data. Applying a logarithmic transformation to \(y_i\) (that is, taking \(\log(\) "Age"
\()\) ) would achieve just that.
An important word on \(\log\): in Data 100 (and most upper-division STEM courses), \(\log\) denotes the natural logarithm with base \(e\). The base-10 logarithm, where relevant, is indicated by \(\log_{10}\).
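As a quick sanity check of this convention in code (np.log is the natural logarithm, np.log10 is base 10):

np.log(np.e), np.log10(100)  # evaluates to (1.0, 2.0)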
z = np.log(y)

r = np.corrcoef(x, z)[0, 1]
theta_1 = r * np.std(z) / np.std(x)
theta_0 = np.mean(z) - theta_1 * np.mean(x)

fig, ax = plt.subplots(1, 2, dpi=200, figsize=(8, 3))
ax[0].scatter(x, z)
ax[0].set_xlabel("Length")
ax[0].set_ylabel(r"$\log{(Age)}$")

ax[1].scatter(x, z)
ax[1].plot(x, theta_0 + theta_1 * x, "tab:red")
ax[1].set_xlabel("Length")
ax[1].set_ylabel(r"$\log{(Age)}$")

plt.subplots_adjust(wspace=0.3)
Our SLR fit looks a lot better! We now have a new target variable: the SLR model is now trying to predict the log of "Age"
, rather than the untransformed "Age"
. In other words, we are applying the transformation \(z_i = \log{(y_i)}\). Notice that the resulting model is still linear in the parameters \(\theta = [\theta_0, \theta_1]\). The SLR model becomes:
\[\hat{\log{y}} = \theta_0 + \theta_1 x\] \[\hat{z} = \theta_0 + \theta_1 x\]
+It turns out that this linearized relationship can help us understand the underlying relationship between \(x\) and \(y\). If we rearrange the relationship above, we find:
\[\log{(y)} = \theta_0 + \theta_1 x\] \[y = e^{\theta_0 + \theta_1 x}\] \[y = (e^{\theta_0})e^{\theta_1 x}\] \[y = C e^{k x}\]
for some constants \(C\) and \(k\).
+\(y\) is an exponential function of \(x\). Applying an exponential fit to the untransformed variables corroborates this finding.
plt.figure(dpi=120, figsize=(4, 3))

plt.scatter(x, y)
plt.plot(x, np.exp(theta_0) * np.exp(theta_1 * x), "tab:red")
plt.xlabel("Length")
plt.ylabel("Age")
Text(0, 0.5, 'Age')
+You may wonder: why did we choose to apply a log transformation specifically? Why not some other function to linearize the data?
+Practically, many other mathematical operations that modify the relative scales of "Age"
and "Length"
could have worked here.
Multiple linear regression is an extension of simple linear regression that adds additional features to the model. The multiple linear regression model takes the form:
+\[\hat{y} = \theta_0\:+\:\theta_1x_{1}\:+\:\theta_2 x_{2}\:+\:...\:+\:\theta_p x_{p}\]
+Our predicted value of \(y\), \(\hat{y}\), is a linear combination of the single observations (features), \(x_i\), and the parameters, \(\theta_i\).
+We’ll dive deeper into Multiple Linear Regression in the next lecture.
+Earlier, we calculated the constant model MSE using calculus. It turns out that there is a much more elegant way of performing this same minimization algebraically, without using calculus at all.
+In this calculation, we use the fact that the sum of deviations from the mean is \(0\) or that \(\sum_{i=1}^{n} (y_i - \bar{y}) = 0\).
+Let’s quickly walk through the proof for this: \[ +\begin{align} +\sum_{i=1}^{n} (y_i - \bar{y}) &= \sum_{i=1}^{n} y_i - \sum_{i=1}^{n} \bar{y} \\ +&= \sum_{i=1}^{n} y_i - n\bar{y} \\ +&= \sum_{i=1}^{n} y_i - n\frac{1}{n}\sum_{i=1}^{n}y_i \\ +&= \sum_{i=1}^{n} y_i - \sum_{i=1}^{n}y_i \\ +& = 0 +\end{align} +\]
+In our calculations, we’ll also be using the definition of the variance as a sample. As a refresher:
+\[\sigma_y^2 = \frac{1}{n}\sum_{i=1}^{n} (y_i - \bar{y})^2\]
+Getting into our calculation for MSE minimization:
+\[ +\begin{align} +R(\theta) &= {\frac{1}{n}}\sum^{n}_{i=1} (y_i - \theta)^2 +\\ &= \frac{1}{n}\sum^{n}_{i=1} [(y_i - \bar{y}) + (\bar{y} - \theta)]^2\quad \quad \text{using trick that a-b can be written as (a-c) + (c-b) } \\ +&\quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \space \space \text{where a, b, and c are any numbers} +\\ &= \frac{1}{n}\sum^{n}_{i=1} [(y_i - \bar{y})^2 + 2(y_i - \bar{y})(\bar{y} - \theta) + (\bar{y} - \theta)^2] +\\ &= \frac{1}{n}[\sum^{n}_{i=1}(y_i - \bar{y})^2 + 2(\bar{y} - \theta)\sum^{n}_{i=1}(y_i - \bar{y}) + n(\bar{y} - \theta)^2] \quad \quad \text{distribute sum to individual terms} +\\ &= \frac{1}{n}\sum^{n}_{i=1}(y_i - \bar{y})^2 + \frac{2}{n}(\bar{y} - \theta)\cdot0 + (\bar{y} - \theta)^2 \quad \quad \text{sum of deviations from mean is 0} +\\ &= \sigma_y^2 + (\bar{y} - \theta)^2 +\end{align} +\]
Since variance can’t be negative, we know that our first term, \(\sigma_y^2\), is greater than or equal to \(0\). Also note that the first term doesn’t involve \(\theta\) at all, meaning changing our model won’t change this value. For the purposes of determining \(\hat{\theta}\), we can then essentially ignore this term.
+Looking at the second term, \((\bar{y} - \theta)^2\), since it is squared, we know it must be greater than or equal to \(0\). As this term does involve \(\theta\), picking the value of \(\theta\) that minimizes this term will allow us to minimize our average loss. For the second term to equal \(0\), \(\theta = \bar{y}\), or in other words, \(\hat{\theta} = \bar{y} = mean(y)\).
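As a quick numerical sanity check of this decomposition (reusing the drinks array from earlier in this note; the choice of \(\theta = 30\) is arbitrary):

theta = 30
empirical_risk = np.mean((drinks - theta) ** 2)
decomposition = np.var(drinks) + (np.mean(drinks) - theta) ** 2
empirical_risk, decomposition  # the two values agree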
+In the derivation above, we decompose the expected loss, \(R(\theta)\), into two key components: the variance of the data, \(\sigma_y^2\), and the square of the bias, \((\bar{y} - \theta)^2\). This decomposition is insightful for understanding the behavior of estimators in statistical models.
+Variance, \(\sigma_y^2\): This term represents the spread of the data points around their mean, \(\bar{y}\), and is a measure of the data’s inherent variability. Importantly, it does not depend on the choice of \(\theta\), meaning it’s a fixed property of the data. Variance serves as an indicator of the data’s dispersion and is crucial in understanding the dataset’s structure, but it remains constant regardless of how we adjust our model parameter \(\theta\).
Bias Squared, \((\bar{y} - \theta)^2\): This term captures the bias of the estimator, defined as the square of the difference between the mean of the data points, \(\bar{y}\), and the parameter \(\theta\). The bias quantifies the systematic error introduced when estimating \(\theta\). Minimizing this term is essential for improving the accuracy of the estimator. When \(\theta = \bar{y}\), the bias is \(0\), indicating that the estimator is unbiased for the parameter it estimates. This highlights a critical principle in statistical estimation: choosing \(\theta\) to be the sample mean, \(\bar{y}\), minimizes the average loss, rendering the estimator both efficient and unbiased for the population mean.
At the end of the Feature Engineering lecture (Lecture 14), we arrived at the issue of fine-tuning model complexity. We identified that a model that’s too complex can lead to overfitting while a model that’s too simple can lead to underfitting. This brings us to a natural question: how do we control model complexity to avoid under- and overfitting?
+To answer this question, we will need to address two things: first, we need to understand when our model begins to overfit by assessing its performance on unseen data. We can achieve this through cross-validation. Secondly, we need to introduce a technique to adjust the complexity of our models ourselves – to do so, we will apply regularization.
+From the last lecture, we learned that increasing model complexity decreased our model’s training error but increased its variance. This makes intuitive sense: adding more features causes our model to fit more closely to data it encountered during training, but it generalizes worse to new data that hasn’t been seen before. For this reason, a low training error is not always representative of our model’s underlying performance – we need to also assess how well it performs on unseen data to ensure that it is not overfitting.
+Truly, the only way to know when our model overfits is by evaluating it on unseen data. Unfortunately, that means we need to wait for more data. This may be very expensive and time-consuming.
+How should we proceed? In this section, we will build up a viable solution to this problem.
The simplest approach to avoid overfitting is to keep some of our data “secret” from ourselves. We can set aside a random portion of our full dataset to use only for testing purposes. The datapoints in this test set will not be used to fit the model. Instead, we will:

1. Use the training set to fit the model and find its optimal parameters.
2. Use the test set to evaluate the fitted model’s performance on unseen data.
Importantly, the optimal model parameters are found by considering only the data in the training set. After the model has been fitted to the training data, we do not change any parameters before making predictions on the test set. We treat the test set performance as the final measure of how well the model does, so the test set is only ever touched once: to compute the model’s performance after all fine-tuning and model design have been completely finalized.
+The process of sub-dividing our dataset into training and test sets is known as a train-test split. Typically, between 10% and 20% of the data is allocated to the test set.
+In sklearn
, the train_test_split
function (documentation) of the model_selection
module allows us to automatically generate train-test splits.
We will work with the vehicles
dataset from previous lectures. As before, we will attempt to predict the mpg
of a vehicle from transformations of its hp
. In the cell below, we allocate 20% of the full dataset to testing, and the remaining 80% to training.
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset and construct the design matrix
vehicles = sns.load_dataset("mpg").rename(columns={"horsepower": "hp"}).dropna()
X = vehicles[["hp"]]
X["hp^2"] = vehicles["hp"]**2
X["hp^3"] = vehicles["hp"]**3
X["hp^4"] = vehicles["hp"]**4

Y = vehicles["mpg"]
from sklearn.model_selection import train_test_split
+
+# `test_size` specifies the proportion of the full dataset that should be allocated to testing
+# `random_state` makes our results reproducible for educational purposes
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=220
)
+print(f"Size of full dataset: {X.shape[0]} points")
+print(f"Size of training set: {X_train.shape[0]} points")
+print(f"Size of test set: {X_test.shape[0]} points")
Size of full dataset: 392 points
+Size of training set: 313 points
+Size of test set: 79 points
+After performing our train-test split, we fit a model to the training set and assess its performance on the test set.
+import sklearn.linear_model as lm
+from sklearn.metrics import mean_squared_error
+
model = lm.LinearRegression()

# Fit to the training set
model.fit(X_train, Y_train)

# Calculate errors
train_error = mean_squared_error(Y_train, model.predict(X_train))
test_error = mean_squared_error(Y_test, model.predict(X_test))
+print(f"Training error: {train_error}")
+print(f"Test error: {test_error}")
Training error: 17.858516841012097
+Test error: 23.19240562932651
+Now, what if we were dissatisfied with our test set performance? With our current framework, we’d be stuck. As outlined previously, assessing model performance on the test set is the final stage of the model design process; we can’t go back and adjust our model based on the new discovery that it is overfitting. If we did, then we would be factoring in information from the test set to design our model. The test error would no longer be a true representation of the model’s performance on unseen data!
Our solution is to introduce a validation set. A validation set is a random portion of the training set that is set aside for assessing model performance while the model is still being developed. The process for using a validation set is:

1. Perform a train-test split and set the test set aside.
2. Set aside a portion of the training set to be used for validation.
3. Fit the model parameters to the datapoints contained in the remaining portion of the training set.
4. Assess the model’s performance on the validation set. Adjust the model as needed, re-fit it to the remaining training data, and re-evaluate it on the validation set, repeating as necessary.
5. After all model development is complete, assess the final model’s performance on the test set.
+The process of creating a validation set is called a validation split.
+Note that the validation error behaves quite differently from the training error explored previously. As the model becomes more complex, it makes better predictions on the training data; the variance of the model typically increases as model complexity increases. Validation error, on the other hand, decreases then increases as we increase model complexity. This reflects the transition from under- to overfitting: at low model complexity, the model underfits because it is not complex enough to capture the main trends in the data; at high model complexity, the model overfits because it “memorizes” the training data too closely.
+We can update our understanding of the relationships between error, complexity, and model variance:
+Our goal is to train a model with complexity near the orange dotted line – this is where our model minimizes the validation error. Note that this relationship is a simplification of the real-world, but it’s a good enough approximation for the purposes of Data 100.
+Introducing a validation set gave us an “extra” chance to assess model performance on another set of unseen data. We are able to finetune the model design based on its performance on this one set of validation data.
+But what if, by random chance, our validation set just happened to contain many outliers? It is possible that the validation datapoints we set aside do not actually represent other unseen data that the model might encounter. Ideally, we would like to validate our model’s performance on several different unseen datasets. This would give us greater confidence in our understanding of how the model behaves on new data.
+Let’s think back to our validation framework. Earlier, we set aside \(x\)% of our training data (say, 20%) to use for validation.
+In the example above, we set aside the first 20% of training datapoints for the validation set. This was an arbitrary choice. We could have set aside any 20% portion of the training data for validation. In fact, there are 5 non-overlapping “chunks” of training points that we could have designated as the validation set.
+The common term for one of these chunks is a fold. In the example above, we had 5 folds, each containing 20% of the training data. This gives us a new perspective: we really have 5 validation sets “hidden” in our training set.
In cross-validation, we perform validation splits for each fold in the training set. For a dataset with \(K\) folds, we:

1. Set aside one fold as the validation fold.
2. Fit the model to the training data from the remaining \(K-1\) folds.
3. Compute the model’s error on the validation fold.
4. Repeat until every fold has served as the validation fold once, then average the \(K\) validation errors. This average is the cross-validation error (a rough sketch of this loop in code is given below).
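The sketch below reuses X_train, Y_train, lm, np, and mean_squared_error from the cells above; the 5-fold setting and random_state=100 are choices made purely for illustration.

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=100)
fold_errors = []
for train_idx, val_idx in kf.split(X_train):
    # Fit to everything except the held-out fold
    fold_model = lm.LinearRegression()
    fold_model.fit(X_train.iloc[train_idx], Y_train.iloc[train_idx])
    # Evaluate on the held-out (validation) fold
    val_predictions = fold_model.predict(X_train.iloc[val_idx])
    fold_errors.append(mean_squared_error(Y_train.iloc[val_idx], val_predictions))

np.mean(fold_errors)  # the cross-validation error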
+At this stage, we have refined our model selection workflow. We begin by performing a train-test split to set aside a test set for the final evaluation of model performance. Then, we alternate between adjusting our design matrix and computing the cross-validation error to finetune the model’s design. In the example below, we illustrate the use of 4-fold cross-validation to help inform model design.
+An important use of cross-validation is for hyperparameter selection. A hyperparameter is some value in a model that is chosen before the model is fit to any data. This means that it is distinct from the model parameters, \(\theta_i\), because its value is selected before the training process begins. We cannot use our usual techniques – calculus, ordinary least squares, or gradient descent – to choose its value. Instead, we must decide it ourselves.
Some examples of hyperparameters in Data 100 are:

- the degree of our polynomial model (recall that we select the degree before creating our design matrix and calling .fit),
- the learning rate, \(\alpha\), in gradient descent, and
- the regularization penalty, \(\lambda\), which we will introduce later in this lecture.

To select a hyperparameter value via cross-validation, we first list out several “guesses” for what the best hyperparameter may be. For each guess, we then run cross-validation to compute the cross-validation error incurred by the model when using that choice of hyperparameter value. We then select the value of the hyperparameter that resulted in the lowest cross-validation error.
+For example, we may wish to use cross-validation to decide what value we should use for \(\alpha\), which controls the step size of each gradient descent update. To do so, we list out some possible guesses for the best \(\alpha\), like 0.1, 1, and 10. For each possible value, we perform cross-validation to see what error the model has when we use that value of \(\alpha\) to train it.
+We’ve now addressed the first of our two goals for today: creating a framework to assess model performance on unseen data. Now, we’ll discuss our second objective: developing a technique to adjust model complexity. This will allow us to directly tackle the issues of under- and overfitting.
+Earlier, we adjusted the complexity of our polynomial model by tuning a hyperparameter – the degree of the polynomial. We tested out several different polynomial degrees, computed the validation error for each, and selected the value that minimized the validation error. Tweaking the “complexity” was simple; it was only a matter of adjusting the polynomial degree.
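A sketch of that tuning loop with scikit-learn is shown below (it reuses vehicles, Y, and lm from earlier; the candidate degrees and the 5-fold setting are assumptions made for illustration).

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

for degree in [1, 2, 3, 4]:
    pipeline = make_pipeline(PolynomialFeatures(degree), lm.LinearRegression())
    # `neg_mean_squared_error` returns negated MSEs, so flip the sign back
    cv_error = -cross_val_score(pipeline, vehicles[["hp"]], Y,
                                cv=5, scoring="neg_mean_squared_error").mean()
    print(f"Degree {degree}: cross-validation error {cv_error:.2f}")

We would then keep whichever degree produced the lowest cross-validation error.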
+In most machine learning problems, complexity is defined differently from what we have seen so far. Today, we’ll explore two different definitions of complexity: the squared and absolute magnitude of \(\theta_i\) coefficients.
Think back to our work using gradient descent to descend down a loss surface. You may find it helpful to refer back to the Gradient Descent note to refresh your memory. Our aim was to find the combination of model parameters that gives the smallest possible loss. We visualized this using a contour map by plotting possible parameter values on the horizontal and vertical axes, which allows us to take a bird’s eye view above the loss surface. Notice that the contour map has \(p=2\) parameters for ease of visualization. We want to find the model parameters corresponding to the lowest point on the loss surface.
+Let’s review our current modeling framework.
+\[\hat{\mathbb{Y}} = \theta_0 + \theta_1 \phi_1 + \theta_2 \phi_2 + \ldots + \theta_p \phi_p\]
+Recall that we represent our features with \(\phi_i\) to reflect the fact that we have performed feature engineering.
+Previously, we restricted model complexity by limiting the total number of features present in the model. We only included a limited number of polynomial features at a time; all other polynomials were excluded from the model.
+What if, instead of fully removing particular features, we kept all features and used each one only a “little bit”? If we put a limit on how much each feature can contribute to the predictions, we can still control the model’s complexity without the need to manually determine how many features should be removed.
+What do we mean by a “little bit”? Consider the case where some parameter \(\theta_i\) is close to or equal to 0. Then, feature \(\phi_i\) barely impacts the prediction – the feature is weighted by such a small value that its presence doesn’t significantly change the value of \(\hat{\mathbb{Y}}\). If we restrict how large each parameter \(\theta_i\) can be, we restrict how much feature \(\phi_i\) contributes to the model. This has the effect of reducing model complexity.
+In regularization, we restrict model complexity by putting a limit on the magnitudes of the model parameters \(\theta_i\).
+What do these limits look like? Suppose we specify that the sum of all absolute parameter values can be no greater than some number \(Q\). In other words:
+\[\sum_{i=1}^p |\theta_i| \leq Q\]
+where \(p\) is the total number of parameters in the model. You can think of this as us giving our model a “budget” for how it distributes the magnitudes of each parameter. If the model assigns a large value to some \(\theta_i\), it may have to assign a small value to some other \(\theta_j\). This has the effect of increasing feature \(\phi_i\)’s influence on the predictions while decreasing the influence of feature \(\phi_j\). The model will need to be strategic about how the parameter weights are distributed – ideally, more “important” features will receive greater weighting.
+Notice that the intercept term, \(\theta_0\), is excluded from this constraint. We typically do not regularize the intercept term.
+Now, let’s think back to gradient descent and visualize the loss surface as a contour map. As a refresher, a loss surface means that each point represents the model’s loss for a particular combination of \(\theta_1\), \(\theta_2\). Let’s say our goal is to find the combination of parameters that gives us the lowest loss.
+
With no constraint, the optimal \(\hat{\theta}\) is in the center. We denote this as \(\hat{\theta}_\text{No Reg}\).
Applying this constraint limits what combinations of model parameters are valid. We can now only consider parameter combinations with a total absolute sum less than or equal to our number \(Q\). For our 2D example, the constraint \(\sum_{i=1}^p |\theta_i| \leq Q\) can be rewritten as \(|\theta_1| + |\theta_2| \leq Q\). This means that we can only assign our regularized parameter vector \(\hat{\theta}_{\text{Reg}}\) to positions in the green diamond below.
+
We can no longer select the parameter vector that truly minimizes the loss surface, \(\hat{\theta}_{\text{No Reg}}\), because this combination of parameters does not lie within our allowed region. Instead, we select whatever allowable combination brings us closest to the true minimum loss, which is depicted by the red point below.
Notice that, under regularization, our optimized \(\theta_1\) and \(\theta_2\) values are much smaller than they were without regularization (indeed, \(\theta_1\) has decreased to 0). The model has decreased in complexity because we have limited how much our features contribute to the model. In fact, by setting its parameter to 0, we have effectively removed the influence of feature \(\phi_1\) from the model altogether.
If we change the value of \(Q\), we change the region of allowed parameter combinations. The model will still choose the combination of parameters that produces the lowest loss – the closest point in the constrained region to the true minimizer, \(\hat{\theta}_{\text{No Reg}}\).
+When \(Q\) is small, we severely restrict the size of our parameters. \(\theta_i\)s are small in value, and features \(\phi_i\) only contribute a little to the model. The allowed region of model parameters contracts, and the model becomes much simpler:
+When \(Q\) is large, we do not restrict our parameter sizes by much. \(\theta_i\)s are large in value, and features \(\phi_i\) contribute more to the model. The allowed region of model parameters expands, and the model becomes more complex:
+Consider the extreme case of when \(Q\) is extremely large. In this situation, our restriction has essentially no effect, and the allowed region includes the OLS solution!
+Now what if \(Q\) was extremely small? Most parameters are then set to (essentially) 0.
+Let’s summarize what we have seen.
+How do we actually apply our constraint \(\sum_{i=1}^p |\theta_i| \leq Q\)? We will do so by modifying the objective function that we seek to minimize when fitting a model.
+Recall our ordinary least squares objective function: our goal was to find parameters that minimize the model’s mean squared error:
+\[\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2 = \frac{1}{n} \sum_{i=1}^n (y_i - (\theta_0 + \theta_1 \phi_{i, 1} + \theta_2 \phi_{i, 2} + \ldots + \theta_p \phi_{i, p}))^2\]
+To apply our constraint, we need to rephrase our minimization goal as:
+\[\frac{1}{n} \sum_{i=1}^n (y_i - (\theta_0 + \theta_1 \phi_{i, 1} + \theta_2 \phi_{i, 2} + \ldots + \theta_p \phi_{i, p}))^2\:\text{such that} \sum_{i=1}^p |\theta_i| \leq Q\]
+Unfortunately, we can’t directly use this formulation as our objective function – it’s not easy to mathematically optimize over a constraint. Instead, we will apply the magic of the Lagrangian Duality. The details of this are out of scope (take EECS 127 if you’re interested in learning more), but the end result is very useful. It turns out that minimizing the following augmented objective function is equivalent to our minimization goal above.
+\[\frac{1}{n} \sum_{i=1}^n (y_i - (\theta_0 + \theta_1 \phi_{i, 1} + \theta_2 \phi_{i, 2} + \ldots + \theta_p \phi_{i, p}))^2 + \lambda \sum_{i=1}^p \vert \theta_i \vert\] \[ = \frac{1}{n}||\mathbb{Y} - \mathbb{X}\theta||_2^2 + \lambda \sum_{i=1}^p |\theta_i|\] \[ = \frac{1}{n}||\mathbb{Y} - \mathbb{X}\theta||_2^2 + \lambda || \theta ||_1\]
The last two expressions include the MSE expressed using vector notation, and the last expression writes \(\sum_{i=1}^p |\theta_i|\) as its L1 norm equivalent form, \(|| \theta ||_1\).
+Notice that we’ve replaced the constraint with a second term in our objective function. We’re now minimizing a function with an additional regularization term that penalizes large coefficients. In order to minimize this new objective function, we’ll end up balancing two components:
+The \(\lambda\) factor controls the degree of regularization. Roughly speaking, \(\lambda\) is related to our \(Q\) constraint from before by the rule \(\lambda \approx \frac{1}{Q}\). To understand why, let’s consider two extreme examples. Recall that our goal is to minimize the cost function: \(\frac{1}{n}||\mathbb{Y} - \mathbb{X}\theta||_2^2 + \lambda || \theta ||_1\).
Assume \(\lambda \rightarrow \infty\). Then, \(\lambda || \theta ||_1\) dominates the cost function. In order to neutralize the \(\infty\) and minimize this term, we set \(\theta_j = 0\) for all \(j \ge 1\). This is a very constrained model that is mathematically equivalent to the constant model.
Assume \(\lambda \rightarrow 0\). Then, \(\lambda || \theta ||_1=0\). Minimizing the cost function is equivalent to minimizing \(\frac{1}{n} || Y - X\theta ||_2^2\), our usual MSE loss function. The act of minimizing MSE loss is just our familiar OLS, and the optimal solution is the global minimum \(\hat{\theta} = \hat\theta_{No Reg.}\).
We call \(\lambda\) the regularization penalty hyperparameter; it needs to be determined prior to training the model, so we must find the best value via cross-validation.
+The process of finding the optimal \(\hat{\theta}\) to minimize our new objective function is called L1 regularization. It is also sometimes known by the acronym “LASSO”, which stands for “Least Absolute Shrinkage and Selection Operator.”
+Unlike ordinary least squares, which can be solved via the closed-form solution \(\hat{\theta}_{OLS} = (\mathbb{X}^{\top}\mathbb{X})^{-1}\mathbb{X}^{\top}\mathbb{Y}\), there is no closed-form solution for the optimal parameter vector under L1 regularization. Instead, we use the Lasso
model class of sklearn
.
import sklearn.linear_model as lm
+
+# The alpha parameter represents our lambda term
lasso_model = lm.Lasso(alpha=2)
lasso_model.fit(X_train, Y_train)
lasso_model.coef_
array([-2.54932056e-01, -9.48597165e-04, 8.91976284e-06, -1.22872290e-08])
+Notice that all model coefficients are very small in magnitude. In fact, some of them are so small that they are essentially 0. An important characteristic of L1 regularization is that many model parameters are set to 0. In other words, LASSO effectively selects only a subset of the features. The reason for this comes back to our loss surface and allowed “diamond” regions from earlier – we can often get closer to the lowest loss contour at a corner of the diamond than along an edge.
+When a model parameter is set to 0 or close to 0, its corresponding feature is essentially removed from the model. We say that L1 regularization performs feature selection because, by setting the parameters of unimportant features to 0, LASSO “selects” which features are more useful for modeling. L1 regularization indicates that the features with non-zero parameters are more informative for modeling than those with parameters set to zero.
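A quick sketch of inspecting this effect (reusing lasso_model and X_train from above; the 1e-3 cutoff is an arbitrary threshold chosen for illustration):

# Features whose fitted LASSO coefficients are not essentially zero
X_train.columns[np.abs(lasso_model.coef_) > 1e-3]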
+The regularization procedure we just performed had one subtle issue. To see what it is, let’s take a look at the design matrix for our lasso_model
.
X_train.head()
+ | hp | +hp^2 | +hp^3 | +hp^4 | +
---|---|---|---|---|
259 | +85.0 | +7225.0 | +614125.0 | +52200625.0 | +
129 | +67.0 | +4489.0 | +300763.0 | +20151121.0 | +
207 | +102.0 | +10404.0 | +1061208.0 | +108243216.0 | +
302 | +70.0 | +4900.0 | +343000.0 | +24010000.0 | +
71 | +97.0 | +9409.0 | +912673.0 | +88529281.0 | +
Our features – hp
, hp^2
, hp^3
, and hp^4
– are on drastically different numeric scales! The values contained in hp^4
are orders of magnitude larger than those contained in hp
. This can be a problem because the value of hp^4
will naturally contribute more to each predicted \(\hat{y}\) because it is so much greater than the values of the other features. For hp
to have much of an impact at all on the prediction, it must be scaled by a large model parameter.
By inspecting the fitted parameters of our model, we see that this is the case – the parameter for hp
is much larger in magnitude than the parameter for hp^4
.
"Feature":X_train.columns, "Parameter":lasso_model.coef_}) pd.DataFrame({
+ | Feature | +Parameter | +
---|---|---|
0 | +hp | +-2.549321e-01 | +
1 | +hp^2 | +-9.485972e-04 | +
2 | +hp^3 | +8.919763e-06 | +
3 | +hp^4 | +-1.228723e-08 | +
Recall that by applying regularization, we give our model a “budget” for how it can allocate the values of model parameters. For hp
to have much of an impact on each prediction, LASSO is forced to “spend” more of this budget on the parameter for hp
.
We can avoid this issue by scaling the data before regularizing. This is a process where we convert all features to the same numeric scale. A common way to scale data is to perform standardization such that all features have mean 0 and standard deviation 1; essentially, we replace everything with its Z-score.
+\[z_i = \frac{x_i - \mu}{\sigma}\]
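A minimal sketch of standardizing the design matrix before fitting LASSO (using sklearn's StandardScaler together with the X_train, Y_train, and lm objects from earlier; alpha=2 simply mirrors the previous cell):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Each column of the scaled design matrix now has mean 0 and standard deviation 1
X_train_scaled = scaler.fit_transform(X_train)

lasso_model_scaled = lm.Lasso(alpha=2)
lasso_model_scaled.fit(X_train_scaled, Y_train)
lasso_model_scaled.coef_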
+In all of our work above, we considered the constraint \(\sum_{i=1}^p |\theta_i| \leq Q\) to limit the complexity of the model. What if we had applied a different constraint?
+In L2 regularization, also known as ridge regression, we constrain the model such that the sum of the squared parameters must be less than some number \(Q\). This constraint takes the form:
+\[\sum_{i=1}^p \theta_i^2 \leq Q\]
+As before, we typically do not regularize the intercept term.
+In our 2D example, the constraint becomes \(\theta_1^2 + \theta_2^2 \leq Q\). Can you see how this is similar to the equation for a circle, \(x^2 + y^2 = r^2\)? The allowed region of parameters for a given value of \(Q\) is now shaped like a ball.
+If we modify our objective function like before, we find that our new goal is to minimize the function: \[\frac{1}{n} \sum_{i=1}^n (y_i - (\theta_0 + \theta_1 \phi_{i, 1} + \theta_2 \phi_{i, 2} + \ldots + \theta_p \phi_{i, p}))^2\:\text{such that} \sum_{i=1}^p \theta_i^2 \leq Q\]
+Notice that all we have done is change the constraint on the model parameters. The first term in the expression, the MSE, has not changed.
+Using Lagrangian Duality (again, out of scope for Data 100), we can re-express our objective function as: \[\frac{1}{n} \sum_{i=1}^n (y_i - (\theta_0 + \theta_1 \phi_{i, 1} + \theta_2 \phi_{i, 2} + \ldots + \theta_p \phi_{i, p}))^2 + \lambda \sum_{i=1}^p \theta_i^2\] \[= \frac{1}{n}||\mathbb{Y} - \mathbb{X}\theta||_2^2 + \lambda \sum_{i=1}^p \theta_i^2\] \[= \frac{1}{n}||\mathbb{Y} - \mathbb{X}\theta||_2^2 + \lambda || \theta ||_2^2\]
The last two expressions include the MSE expressed using vector notation, and the last expression writes \(\sum_{i=1}^p \theta_i^2\) as its L2 norm equivalent form, \(|| \theta ||_2^2\).
+When applying L2 regularization, our goal is to minimize this updated objective function.
+Unlike L1 regularization, L2 regularization does have a closed-form solution for the best parameter vector when regularization is applied:
+\[\hat\theta_{\text{ridge}} = (\mathbb{X}^{\top}\mathbb{X} + n\lambda I)^{-1}\mathbb{X}^{\top}\mathbb{Y}\]
This solution exists even if \(\mathbb{X}\) is not full column rank. This is a major reason why L2 regularization is often used – it can produce a solution even when there is collinearity in the features. We will discuss the concept of collinearity in a future lecture, but we will not derive this result in Data 100, as it involves a fair bit of matrix calculus.
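For concreteness, here is a sketch of this closed form in numpy. It is not a reference implementation: it assumes X is a plain design-matrix array that already includes an intercept column and, for simplicity, ignores the convention of leaving the intercept unregularized.

def ridge_closed_form(X, Y, lam):
    """Sketch of the ridge solution (X^T X + n*lam*I)^{-1} X^T Y."""
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    n, p = X.shape
    return np.linalg.inv(X.T @ X + n * lam * np.eye(p)) @ X.T @ Y

In practice, np.linalg.solve would be preferred over explicitly inverting the matrix; the inverse is used here only to mirror the formula above.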
+In sklearn
, we perform L2 regularization using the Ridge
class, which minimizes the L2-regularized objective function. As discussed above, in practice we should scale the features before regularizing.
ridge_model = lm.Ridge(alpha=1)  # alpha represents the hyperparameter lambda
ridge_model.fit(X_train, Y_train)
ridge_model.coef_
array([ 5.89130559e-02, -6.42445915e-03, 4.44468157e-05, -8.83981945e-08])
+Our regression models are summarized below. Note the objective function is what the gradient descent optimizer minimizes.
Type | Model | Loss | Regularization | Objective Function | Solution |
---|---|---|---|---|---|
OLS | \(\hat{\mathbb{Y}} = \mathbb{X}\theta\) | MSE | None | \(\frac{1}{n} \|\mathbb{Y}-\mathbb{X} \theta\|^2_2\) | \(\hat{\theta}_{OLS} = (\mathbb{X}^{\top}\mathbb{X})^{-1}\mathbb{X}^{\top}\mathbb{Y}\) if \(\mathbb{X}\) is full column rank |
Ridge | \(\hat{\mathbb{Y}} = \mathbb{X} \theta\) | MSE | L2 | \(\frac{1}{n} \|\mathbb{Y}-\mathbb{X}\theta\|^2_2 + \lambda \sum_{i=1}^p \theta_i^2\) | \(\hat{\theta}_{ridge} = (\mathbb{X}^{\top}\mathbb{X} + n \lambda I)^{-1}\mathbb{X}^{\top}\mathbb{Y}\) |
LASSO | \(\hat{\mathbb{Y}} = \mathbb{X} \theta\) | MSE | L1 | \(\frac{1}{n} \|\mathbb{Y}-\mathbb{X}\theta\|^2_2 + \lambda \sum_{i=1}^p \vert \theta_i \vert\) | No closed form solution |
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline

plt.rcParams['figure.figsize'] = (12, 9)

sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
pd.set_option('display.float_format', '{:.2f}'.format)

# Silence some spurious seaborn warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
In the past few lectures, we’ve learned that pandas
is a toolkit to restructure, modify, and explore a dataset. What we haven’t yet touched on is how to make these data transformation decisions. When we receive a new set of data from the “real world,” how do we know what processing we should do to convert this data into a usable form?
Data cleaning, also called data wrangling, is the process of transforming raw data to facilitate subsequent analysis. It is often used to address issues like:
+Exploratory Data Analysis (EDA) is the process of understanding a new dataset. It is an open-ended, informal analysis that involves familiarizing ourselves with the variables present in the data, discovering potential hypotheses, and identifying possible issues with the data. This last point can often motivate further data cleaning to address any problems with the dataset’s format; because of this, EDA and data cleaning are often thought of as an “infinite loop,” with each process driving the other.
In this lecture, we will go over the key properties of data to consider when performing data cleaning and EDA. In doing so, we’ll develop a “checklist” of sorts for you to consult when approaching a new dataset. Throughout this process, we’ll build a deeper understanding of this early (but very important!) stage of the data science lifecycle.
We often prefer rectangular data for data analysis. Rectangular structures are easy to manipulate and analyze. A key element of data cleaning is transforming data to be more rectangular.
+There are two kinds of rectangular data: tables and matrices. Tables have named columns with different data types and are manipulated using data transformation languages. Matrices contain numeric data of the same type and are manipulated using linear algebra.
+There are many file types for storing structured data: TSV, JSON, XML, ASCII, SAS, etc. We’ll only cover CSV, TSV, and JSON in lecture, but you’ll likely encounter other formats as you work with different datasets. Reading documentation is your best bet for understanding how to process the multitude of different file types.
+CSVs, which stand for Comma-Separated Values, are a common tabular data format. In the past two pandas
lectures, we briefly touched on the idea of file format: the way data is encoded in a file for storage. Specifically, our elections
and babynames
datasets were stored and loaded as CSVs:
"data/elections.csv").head(5) pd.read_csv(
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.21 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.79 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.20 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.80 | +
4 | +1832 | +Andrew Jackson | +Democratic | +702735 | +win | +54.57 | +
To better understand the properties of a CSV, let’s take a look at the first few rows of the raw data file to see what it looks like before being loaded into a DataFrame
. We’ll use the repr()
function to return the raw string with its special characters:
with open("data/elections.csv", "r") as table:
+= 0
+ i for row in table:
+ print(repr(row))
+ += 1
+ i if i > 3:
+ break
'Year,Candidate,Party,Popular vote,Result,%\n'
+'1824,Andrew Jackson,Democratic-Republican,151271,loss,57.21012204\n'
+'1824,John Quincy Adams,Democratic-Republican,113142,win,42.78987796\n'
+'1828,Andrew Jackson,Democratic,642806,win,56.20392707\n'
+Each row, or record, in the data is delimited by a newline \n
. Each column, or field, in the data is delimited by a comma ,
(hence, comma-separated!).
Another common file type is TSV (Tab-Separated Values). In a TSV, records are still delimited by a newline \n
, while fields are delimited by \t
tab character.
Let’s check out the first few rows of the raw TSV file. Again, we’ll use the repr()
function so that print
shows the special characters.
with open("data/elections.txt", "r") as table:
+= 0
+ i for row in table:
+ print(repr(row))
+ += 1
+ i if i > 3:
+ break
'\ufeffYear\tCandidate\tParty\tPopular vote\tResult\t%\n'
+'1824\tAndrew Jackson\tDemocratic-Republican\t151271\tloss\t57.21012204\n'
+'1824\tJohn Quincy Adams\tDemocratic-Republican\t113142\twin\t42.78987796\n'
+'1828\tAndrew Jackson\tDemocratic\t642806\twin\t56.20392707\n'
+TSVs can be loaded into pandas
using pd.read_csv
. We’ll need to specify the delimiter with the parameter sep='\t'
(documentation).
"data/elections.txt", sep='\t').head(3) pd.read_csv(
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.21 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.79 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.20 | +
An issue with CSVs and TSVs comes up whenever there are commas or tabs within the records. How does pandas
differentiate between a comma delimiter vs. a comma within the field itself, for example 8,900
? To remedy this, check out the quotechar
parameter.
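As a small sketch of this behavior (the two-line CSV string below is made up for illustration; pd refers to pandas as imported above):

from io import StringIO

raw = 'City,Population\n"Berkeley, CA","121,485"\n'
# The quoted fields keep the embedded commas from being treated as delimiters
pd.read_csv(StringIO(raw), quotechar='"')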
JSON (JavaScript Object Notation) files behave similarly to Python dictionaries. A raw JSON is shown below.
+with open("data/elections.json", "r") as table:
+= 0
+ i for row in table:
+ print(row)
+ += 1
+ i if i > 8:
+ break
[
+
+ {
+
+ "Year": 1824,
+
+ "Candidate": "Andrew Jackson",
+
+ "Party": "Democratic-Republican",
+
+ "Popular vote": 151271,
+
+ "Result": "loss",
+
+ "%": 57.21012204
+
+ },
+
+JSON files can be loaded into pandas
using pd.read_json
.
pd.read_json('data/elections.json').head(3)
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.21 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.79 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.20 | +
The City of Berkeley Open Data website has a dataset with COVID-19 Confirmed Cases among Berkeley residents by date. Let’s download the file and save it as a JSON (note the source URL file type is also a JSON). In the interest of reproducible data science, we will download the data programmatically. We have defined some helper functions in the ds100_utils.py file so that we can reuse them in many different notebooks.
from ds100_utils import fetch_and_cache
+
covid_file = fetch_and_cache(
    "https://data.cityofberkeley.info/api/views/xn6j-b766/rows.json?accessType=DOWNLOAD",
    "confirmed-cases.json",
    force=False)
covid_file  # a file path wrapper object
Using cached version that was downloaded (UTC): Tue Aug 27 03:33:01 2024
+PosixPath('data/confirmed-cases.json')
+Let’s start our analysis by getting a rough estimate of the size of the dataset to inform the tools we use to view the data. For relatively small datasets, we can use a text editor or spreadsheet. For larger datasets, more programmatic exploration or distributed computing tools may be more fitting. Here we will use Python
tools to probe the file.
Since this appears to be a text file, let’s investigate the number of lines, which often corresponds to the number of records.
+import os
+
+print(covid_file, "is", os.path.getsize(covid_file) / 1e6, "MB")
+
with open(covid_file, "r") as f:
    print(covid_file, "is", sum(1 for l in f), "lines.")
data/confirmed-cases.json is 0.116367 MB
+data/confirmed-cases.json is 1110 lines.
+As part of the EDA workflow, Unix commands can come in very handy. In fact, there’s an entire book called “Data Science at the Command Line” that explores this idea in depth! In Jupyter/IPython, you can prefix lines with !
to execute arbitrary Unix commands, and within those lines, you can refer to Python variables and expressions with the syntax {expr}
.
Here, we use the ls
command to list files, using the -lh
flags, which request “long format with information in human-readable form.” We also use the wc
command for “word count,” but with the -l
flag, which asks for line counts instead of words.
These two give us the same information as the code above, albeit in a slightly different form:
+!ls -lh {covid_file}
+!wc -l {covid_file}
-rw-r--r-- 1 jianingding21 staff 114K Aug 27 03:33 data/confirmed-cases.json
+ 1109 data/confirmed-cases.json
+Let’s explore the data format using Python
.
with open(covid_file, "r") as f:
    for i, row in enumerate(f):
        print(repr(row))  # print raw strings
        if i >= 4: break
'{\n'
+' "meta" : {\n'
+' "view" : {\n'
+' "id" : "xn6j-b766",\n'
+' "name" : "COVID-19 Confirmed Cases",\n'
+We can use the head
Unix command (which is where pandas
’ head
method comes from!) to see the first few lines of the file:
!head -5 {covid_file}
{
+ "meta" : {
+ "view" : {
+ "id" : "xn6j-b766",
+ "name" : "COVID-19 Confirmed Cases",
+In order to load the JSON file into pandas
, let’s first do some EDA with Python’s json
package to understand the particular structure of this JSON file so that we can decide what (if anything) to load into pandas
. Python has relatively good support for JSON data since it closely matches the internal python object model. In the following cell we import the entire JSON datafile into a python dictionary using the json
package.
import json
+
+with open(covid_file, "rb") as f:
    covid_json = json.load(f)
The covid_json
variable is now a dictionary encoding the data in the file:
type(covid_json)
dict
+We can examine what keys are in the top level JSON object by listing out the keys.
+ covid_json.keys()
dict_keys(['meta', 'data'])
+Observation: The JSON dictionary contains a meta
key which likely refers to metadata (data about the data). Metadata is often maintained with the data and can be a good source of additional information.
We can investigate the metadata further by examining the keys associated with the metadata.
covid_json['meta'].keys()
dict_keys(['view'])
+The meta
key contains another dictionary called view
. This likely refers to metadata about a particular “view” of some underlying database. We will learn more about views when we study SQL later in the class.
covid_json['meta']['view'].keys()
dict_keys(['id', 'name', 'assetType', 'attribution', 'averageRating', 'category', 'createdAt', 'description', 'displayType', 'downloadCount', 'hideFromCatalog', 'hideFromDataJson', 'newBackend', 'numberOfComments', 'oid', 'provenance', 'publicationAppendEnabled', 'publicationDate', 'publicationGroup', 'publicationStage', 'rowsUpdatedAt', 'rowsUpdatedBy', 'tableId', 'totalTimesRated', 'viewCount', 'viewLastModified', 'viewType', 'approvals', 'columns', 'grants', 'metadata', 'owner', 'query', 'rights', 'tableAuthor', 'tags', 'flags'])
Notice that this is a nested/recursive data structure. As we dig deeper, we reveal more and more keys and the corresponding data:
+meta
+|-> data
+ | ... (haven't explored yet)
+|-> view
+ | -> id
+ | -> name
+ | -> attribution
+ ...
+ | -> description
+ ...
+ | -> columns
+ ...
+There is a key called description in the view sub dictionary. This likely contains a description of the data:
+print(covid_json['meta']['view']['description'])
Counts of confirmed COVID-19 cases among Berkeley residents by date.
+We can look at a few entries in the data
field. This is what we’ll load into pandas
.
for i in range(3):
    print(f"{i:03} | {covid_json['data'][i]}")
000 | ['row-kzbg.v7my-c3y2', '00000000-0000-0000-0405-CB14DE51DAA7', 0, 1643733903, None, 1643733903, None, '{ }', '2020-02-28T00:00:00', '1', '1']
+001 | ['row-jkyx_9u4r-h2yw', '00000000-0000-0000-F806-86D0DBE0E17F', 0, 1643733903, None, 1643733903, None, '{ }', '2020-02-29T00:00:00', '0', '1']
+002 | ['row-qifg_4aug-y3ym', '00000000-0000-0000-2DCE-4D1872F9B216', 0, 1643733903, None, 1643733903, None, '{ }', '2020-03-01T00:00:00', '0', '1']
Observations:

- These look like equal-length records, so maybe data is a table!
- But what does each value in the record mean? Where can we find the column headers?
For that, we’ll need the columns
key in the metadata dictionary. This returns a list:
type(covid_json['meta']['view']['columns'])
list
+pandas
Finally, let’s load the data (not the metadata) into a pandas
DataFrame
. In the following block of code we:
1. Translate the JSON records into a DataFrame, using covid_json['meta']['view']['columns'] to supply the column names and covid_json['data'] to supply the records.
2. Remove columns that have no metadata description. This would be a bad idea in general, but here we remove these columns since the above analysis suggests they are unlikely to contain useful information.
3. Examine the tail of the table.
# Load the data from JSON and assign column titles
covid = pd.DataFrame(
    covid_json['data'],
    columns=[c['name'] for c in covid_json['meta']['view']['columns']])

covid.tail()
+ | sid | +id | +position | +created_at | +created_meta | +updated_at | +updated_meta | +meta | +Date | +New Cases | +Cumulative Cases | +
---|---|---|---|---|---|---|---|---|---|---|---|
699 | +row-49b6_x8zv.gyum | +00000000-0000-0000-A18C-9174A6D05774 | +0 | +1643733903 | +None | +1643733903 | +None | +{ } | +2022-01-27T00:00:00 | +106 | +10694 | +
700 | +row-gs55-p5em.y4v9 | +00000000-0000-0000-F41D-5724AEABB4D6 | +0 | +1643733903 | +None | +1643733903 | +None | +{ } | +2022-01-28T00:00:00 | +223 | +10917 | +
701 | +row-3pyj.tf95-qu67 | +00000000-0000-0000-BEE3-B0188D2518BD | +0 | +1643733903 | +None | +1643733903 | +None | +{ } | +2022-01-29T00:00:00 | +139 | +11056 | +
702 | +row-cgnd.8syv.jvjn | +00000000-0000-0000-C318-63CF75F7F740 | +0 | +1643733903 | +None | +1643733903 | +None | +{ } | +2022-01-30T00:00:00 | +33 | +11089 | +
703 | +row-qywv_24x6-237y | +00000000-0000-0000-FE92-9789FED3AA20 | +0 | +1643733903 | +None | +1643733903 | +None | +{ } | +2022-01-31T00:00:00 | +42 | +11131 | +
Last time, we introduced .merge
as the pandas
method for joining multiple DataFrame
s together. In our discussion of joins, we touched on the idea of using a “key” to determine what rows should be merged from each table. Let’s take a moment to examine this idea more closely.
The primary key is the column or set of columns in a table that uniquely determine the values of the remaining columns. It can be thought of as the unique identifier for each individual row in the table. For example, a table of Data 100 students might use each student’s Cal ID as the primary key.
 | Cal ID | Name | Major |
---|---|---|---|
0 | 3034619471 | Oski | Data Science |
1 | 3035619472 | Ollie | Computer Science |
2 | 3025619473 | Orrie | Data Science |
3 | 3046789372 | Ollie | Economics |
The foreign key is the column or set of columns in a table that reference primary keys in other tables. Knowing a dataset’s foreign keys can be useful when assigning the left_on
and right_on
parameters of .merge
. In the table of office hour tickets below, "Cal ID"
is a foreign key referencing the previous table.
 | OH Request | Cal ID | Question |
---|---|---|---|
0 | 1 | 3034619471 | HW 2 Q1 |
1 | 2 | 3035619472 | HW 2 Q3 |
2 | 3 | 3025619473 | Lab 3 Q4 |
3 | 4 | 3035619472 | HW 2 Q7 |
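A minimal sketch of how these keys come into play when merging (the two DataFrames below are constructed by hand to mirror the tables above):

students = pd.DataFrame({
    "Cal ID": [3034619471, 3035619472, 3025619473, 3046789372],
    "Name": ["Oski", "Ollie", "Orrie", "Ollie"],
    "Major": ["Data Science", "Computer Science", "Data Science", "Economics"],
})
tickets = pd.DataFrame({
    "OH Request": [1, 2, 3, 4],
    "Cal ID": [3034619471, 3035619472, 3025619473, 3035619472],
    "Question": ["HW 2 Q1", "HW 2 Q3", "Lab 3 Q4", "HW 2 Q7"],
})

# The foreign key in `tickets` references the primary key in `students`
tickets.merge(students, on="Cal ID")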
Variables are columns. A variable is a measurement of a particular concept. Variables have two common properties: data type/storage type and variable type/feature type. The data type of a variable indicates how each variable value is stored in memory (integer, floating point, boolean, etc.) and affects which pandas
functions are used. The variable type is a conceptualized measurement of information (and therefore indicates what values a variable can take on). Variable type is identified through expert knowledge, exploring the data itself, or consulting the data codebook. The variable type affects how one visualizes and interprets the data. In this class, “variable types” are conceptual.
After loading data into a file, it’s a good idea to take the time to understand what pieces of information are encoded in the dataset. In particular, we want to identify what variable types are present in our data. Broadly speaking, we can categorize variables into one of two overarching types.
Quantitative variables describe some numeric quantity or amount. We can divide quantitative data further into:

- Continuous quantitative variables, which can be measured on a continuous numeric scale to arbitrary precision (for example, a price or a temperature).
- Discrete quantitative variables, which can only take on a finite set of possible numeric values (for example, a number of siblings).

Qualitative variables, also known as categorical variables, describe data that isn’t measuring some quantity or amount. The sub-categories of categorical data are:

- Ordinal qualitative variables, whose categories have an ordering (for example, an education level or a star rating).
- Nominal qualitative variables, whose categories have no particular ordering (for example, a Cal ID number or a political affiliation).
+Note that many variables don’t sit neatly in just one of these categories. Qualitative variables could have numeric levels, and conversely, quantitative variables could be stored as strings.
+After understanding the structure of the dataset, the next task is to determine what exactly the data represents. We’ll do so by considering the data’s granularity, scope, and temporality.
+The granularity of a dataset is what a single row represents. You can also think of it as the level of detail included in the data. To determine the data’s granularity, ask: what does each row in the dataset represent? Fine-grained data contains a high level of detail, with a single row representing a small individual unit. For example, each record may represent one person. Coarse-grained data is encoded such that a single row represents a large individual unit – for example, each record may represent a group of people.
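As a small sketch of the idea (using a hypothetical people DataFrame constructed here), aggregating fine-grained, one-row-per-person data produces a coarser, one-row-per-city table:

people = pd.DataFrame({
    "city": ["Berkeley", "Berkeley", "Oakland"],
    "age": [20, 22, 35],
})

# Fine-grained: one row per person. Coarse-grained: one row per city.
people.groupby("city").mean()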
+The scope of a dataset is the subset of the population covered by the data. If we were investigating student performance in Data Science courses, a dataset with a narrow scope might encompass all students enrolled in Data 100 whereas a dataset with an expansive scope might encompass all students in California.
+The temporality of a dataset describes the periodicity over which the data was collected as well as when the data was most recently collected or updated.
Time and date fields of a dataset could represent a few things:

- when the “event” took place,
- when the data was collected (or when it was entered into the system), or
- when the data was copied into the database.
To fully understand the temporality of the data, it also may be necessary to standardize time zones or inspect recurring time-based trends in the data (do patterns recur in 24-hour periods? Over the course of a month? Seasonally?). The convention for standardizing time is Coordinated Universal Time (UTC), an international time standard measured at 0 degrees longitude that stays consistent throughout the year (no daylight saving). We can represent Berkeley’s time zone as UTC-8 during Pacific Standard Time (PST) and UTC-7 during Pacific Daylight Time (PDT).
+pandas
’ dt
accessorsLet’s briefly look at how we can use pandas
’ dt
accessors to work with dates/times in a dataset using the dataset you’ll see in Lab 3: the Berkeley PD Calls for Service dataset.
= pd.read_csv("data/Berkeley_PD_-_Calls_for_Service.csv")
+ calls calls.head()
+ | CASENO | +OFFENSE | +EVENTDT | +EVENTTM | +CVLEGEND | +CVDOW | +InDbDate | +Block_Location | +BLKADDR | +City | +State | +
---|---|---|---|---|---|---|---|---|---|---|---|
0 | +21014296 | +THEFT MISD. (UNDER $950) | +04/01/2021 12:00:00 AM | +10:58 | +LARCENY | +4 | +06/15/2021 12:00:00 AM | +Berkeley, CA\n(37.869058, -122.270455) | +NaN | +Berkeley | +CA | +
1 | +21014391 | +THEFT MISD. (UNDER $950) | +04/01/2021 12:00:00 AM | +10:38 | +LARCENY | +4 | +06/15/2021 12:00:00 AM | +Berkeley, CA\n(37.869058, -122.270455) | +NaN | +Berkeley | +CA | +
2 | +21090494 | +THEFT MISD. (UNDER $950) | +04/19/2021 12:00:00 AM | +12:15 | +LARCENY | +1 | +06/15/2021 12:00:00 AM | +2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... | +2100 BLOCK HASTE ST | +Berkeley | +CA | +
3 | +21090204 | +THEFT FELONY (OVER $950) | +02/13/2021 12:00:00 AM | +17:00 | +LARCENY | +6 | +06/15/2021 12:00:00 AM | +2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... | +2600 BLOCK WARRING ST | +Berkeley | +CA | +
4 | +21090179 | +BURGLARY AUTO | +02/08/2021 12:00:00 AM | +6:20 | +BURGLARY - VEHICLE | +1 | +06/15/2021 12:00:00 AM | +2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... | +2700 BLOCK GARBER ST | +Berkeley | +CA | +
Looks like there are three columns with dates/times: EVENTDT
, EVENTTM
, and InDbDate
.
Most likely, EVENTDT
stands for the date when the event took place, EVENTTM
stands for the time of day the event took place (in 24-hr format), and InDbDate
is the date this call is recorded onto the database.
If we check the data type of these columns, we will see they are stored as strings. We can convert them to datetime
objects using pandas to_datetime
function.
"EVENTDT"] = pd.to_datetime(calls["EVENTDT"])
+ calls[ calls.head()
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57895/874729699.py:1: UserWarning:
+
+Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
+
++ | CASENO | +OFFENSE | +EVENTDT | +EVENTTM | +CVLEGEND | +CVDOW | +InDbDate | +Block_Location | +BLKADDR | +City | +State | +
---|---|---|---|---|---|---|---|---|---|---|---|
0 | +21014296 | +THEFT MISD. (UNDER $950) | +2021-04-01 | +10:58 | +LARCENY | +4 | +06/15/2021 12:00:00 AM | +Berkeley, CA\n(37.869058, -122.270455) | +NaN | +Berkeley | +CA | +
1 | +21014391 | +THEFT MISD. (UNDER $950) | +2021-04-01 | +10:38 | +LARCENY | +4 | +06/15/2021 12:00:00 AM | +Berkeley, CA\n(37.869058, -122.270455) | +NaN | +Berkeley | +CA | +
2 | +21090494 | +THEFT MISD. (UNDER $950) | +2021-04-19 | +12:15 | +LARCENY | +1 | +06/15/2021 12:00:00 AM | +2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... | +2100 BLOCK HASTE ST | +Berkeley | +CA | +
3 | +21090204 | +THEFT FELONY (OVER $950) | +2021-02-13 | +17:00 | +LARCENY | +6 | +06/15/2021 12:00:00 AM | +2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... | +2600 BLOCK WARRING ST | +Berkeley | +CA | +
4 | +21090179 | +BURGLARY AUTO | +2021-02-08 | +6:20 | +BURGLARY - VEHICLE | +1 | +06/15/2021 12:00:00 AM | +2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... | +2700 BLOCK GARBER ST | +Berkeley | +CA | +
Now, we can use the dt
accessor on this column.
We can get the month:
+"EVENTDT"].dt.month.head() calls[
0 4
+1 4
+2 4
+3 2
+4 2
+Name: EVENTDT, dtype: int32
+Which day of the week the date is on:
+"EVENTDT"].dt.dayofweek.head() calls[
0 3
+1 3
+2 0
+3 5
+4 0
+Name: EVENTDT, dtype: int32
Check the minimum values to see if there are any suspicious-looking 70s dates:
+"EVENTDT").head() calls.sort_values(
+ | CASENO | +OFFENSE | +EVENTDT | +EVENTTM | +CVLEGEND | +CVDOW | +InDbDate | +Block_Location | +BLKADDR | +City | +State | +
---|---|---|---|---|---|---|---|---|---|---|---|
2513 | +20057398 | +BURGLARY COMMERCIAL | +2020-12-17 | +16:05 | +BURGLARY - COMMERCIAL | +4 | +06/15/2021 12:00:00 AM | +600 BLOCK GILMAN ST\nBerkeley, CA\n(37.878405,... | +600 BLOCK GILMAN ST | +Berkeley | +CA | +
624 | +20057207 | +ASSAULT/BATTERY MISD. | +2020-12-17 | +16:50 | +ASSAULT | +4 | +06/15/2021 12:00:00 AM | +2100 BLOCK SHATTUCK AVE\nBerkeley, CA\n(37.871... | +2100 BLOCK SHATTUCK AVE | +Berkeley | +CA | +
154 | +20092214 | +THEFT FROM AUTO | +2020-12-17 | +18:30 | +LARCENY - FROM VEHICLE | +4 | +06/15/2021 12:00:00 AM | +800 BLOCK SHATTUCK AVE\nBerkeley, CA\n(37.8918... | +800 BLOCK SHATTUCK AVE | +Berkeley | +CA | +
659 | +20057324 | +THEFT MISD. (UNDER $950) | +2020-12-17 | +15:44 | +LARCENY | +4 | +06/15/2021 12:00:00 AM | +1800 BLOCK 4TH ST\nBerkeley, CA\n(37.869888, -... | +1800 BLOCK 4TH ST | +Berkeley | +CA | +
993 | +20057573 | +BURGLARY RESIDENTIAL | +2020-12-17 | +22:15 | +BURGLARY - RESIDENTIAL | +4 | +06/15/2021 12:00:00 AM | +1700 BLOCK STUART ST\nBerkeley, CA\n(37.857495... | +1700 BLOCK STUART ST | +Berkeley | +CA | +
Doesn’t look like it! We are good!
+We can also do many things with the dt
accessor like switching time zones and converting time back to UNIX/POSIX time. Check out the documentation on .dt
accessor and time series/date functionality.
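As a brief illustration (not part of the lab), here is one reasonable way to localize the EVENTDT timestamps from the calls DataFrame above to the Pacific time zone, convert them to UTC, and express them in UNIX time; the time zone string and conversion chain are illustrative assumptions, not steps from the lecture.

# Localize the naive timestamps to the Pacific time zone,
# then convert to UTC and to UNIX time (seconds since 1970-01-01 UTC)
event_pacific = calls["EVENTDT"].dt.tz_localize("US/Pacific")
event_utc = event_pacific.dt.tz_convert("UTC")
unix_seconds = event_utc.astype("int64") // 10**9  # nanoseconds -> seconds
unix_seconds.head()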
At this stage in our data cleaning and EDA workflow, we’ve achieved quite a lot: we’ve identified how our data is structured, come to terms with what information it encodes, and gained insight as to how it was generated. Throughout this process, we should always recall the original intent of our work in Data Science – to use data to better understand and model the real world. To achieve this goal, we need to ensure that the data we use is faithful to reality; that is, that our data accurately captures the “real world.”
Data used in research or industry is often “messy” – there may be errors or inaccuracies that impact the faithfulness of the dataset. Signs that data may not be faithful include unrealistic or “incorrect” values, violations of obvious dependencies (for example, an age that does not match a birthdate), clearly hand-entered or falsified entries, and duplicated records or fields.
We often address these more common issues by spot-checking values against expectations, consulting any available documentation or data codebook, and cleaning or removing problematic records.
+Another common issue encountered with real-world datasets is that of missing data. One strategy to resolve this is to simply drop any records with missing values from the dataset. This does, however, introduce the risk of inducing biases – it is possible that the missing or corrupt records may be systemically related to some feature of interest in the data. Another solution is to keep the data as NaN
values.
A third method to address missing data is to perform imputation: infer the missing values using other data available in the dataset. There is a wide variety of imputation techniques; common choices include average imputation (replacing a missing value with the average value of that field) and hot deck imputation (replacing a missing value with a randomly chosen observed value).
+Regardless of the strategy used to deal with missing data, we should think carefully about why particular records or fields may be missing – this can help inform whether or not the absence of these values is significant or meaningful.
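As a minimal sketch of these three broad strategies in pandas, assume a hypothetical DataFrame df with a numeric column "value" that contains missing entries; the data and column name below are stand-ins for illustration only.

import numpy as np
import pandas as pd

# hypothetical toy data with a missing entry
df = pd.DataFrame({"value": [1.0, 2.0, np.nan, 4.0]})

dropped = df.dropna(subset=["value"])                               # 1. drop records with missing values
kept = df                                                           # 2. keep the missing values as NaN
imputed = df.assign(value=df["value"].fillna(df["value"].mean()))   # 3. impute with the field's average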
+Now, let’s walk through the data-cleaning and EDA workflow to see what can we learn about the presence of Tuberculosis in the United States!
+We will examine the data included in the original CDC article published in 2021.
+Suppose Table 1 was saved as a CSV file located in data/cdc_tuberculosis.csv
.
We can then explore the CSV (which is a text file, and does not contain binary-encoded data) in many ways:
1. Using a text editor like emacs, vim, VSCode, etc.
2. Opening the CSV directly in DataHub (read-only), Excel, Google Sheets, etc.
3. The Python file object
4. pandas, using pd.read_csv()
To try out options 1 and 2, you can view or download the Tuberculosis dataset from the lecture demo notebook under the data folder in the left-hand menu. Notice how the CSV file is a type of rectangular data (i.e., tabular data) stored as comma-separated values.
Next, let’s try out option 3 using the Python
file object. We’ll look at the first four lines:
with open("data/cdc_tuberculosis.csv", "r") as f:
+= 0
+ i for row in f:
+ print(row)
+ += 1
+ i if i > 3:
+ break
,No. of TB cases,,,TB incidence,,
+
+U.S. jurisdiction,2019,2020,2021,2019,2020,2021
+
+Total,"8,900","7,173","7,860",2.71,2.16,2.37
+
+Alabama,87,72,92,1.77,1.43,1.83
+
Whoa, why are there blank lines interspersed between the lines of the CSV?
+You may recall that all line breaks in text files are encoded as the special newline character \n
. Python’s print()
prints each string (including the newline), and an additional newline on top of that.
If you’re curious, we can use the repr()
function to return the raw string with all special characters:
with open("data/cdc_tuberculosis.csv", "r") as f:
+= 0
+ i for row in f:
+ print(repr(row)) # print raw strings
+ += 1
+ i if i > 3:
+ break
',No. of TB cases,,,TB incidence,,\n'
+'U.S. jurisdiction,2019,2020,2021,2019,2020,2021\n'
+'Total,"8,900","7,173","7,860",2.71,2.16,2.37\n'
+'Alabama,87,72,92,1.77,1.43,1.83\n'
+Finally, let’s try option 4 and use the tried-and-true Data 100 approach: pandas
.
= pd.read_csv("data/cdc_tuberculosis.csv")
+ tb_df tb_df.head()
+ | Unnamed: 0 | +No. of TB cases | +Unnamed: 2 | +Unnamed: 3 | +TB incidence | +Unnamed: 5 | +Unnamed: 6 | +
---|---|---|---|---|---|---|---|
0 | +U.S. jurisdiction | +2019 | +2020 | +2021 | +2019.00 | +2020.00 | +2021.00 | +
1 | +Total | +8,900 | +7,173 | +7,860 | +2.71 | +2.16 | +2.37 | +
2 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +
3 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +
4 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +
You may notice some strange things about this table: what’s up with the “Unnamed” column names and the first row?
+Congratulations — you’re ready to wrangle your data! Because of how things are stored, we’ll need to clean the data a bit to name our columns better.
+A reasonable first step is to identify the row with the right header. The pd.read_csv()
function (documentation) has the convenient header
parameter that we can set to use the elements in row 1 as the appropriate columns:
= pd.read_csv("data/cdc_tuberculosis.csv", header=1) # row index
+ tb_df 5) tb_df.head(
+ | U.S. jurisdiction | +2019 | +2020 | +2021 | +2019.1 | +2020.1 | +2021.1 | +
---|---|---|---|---|---|---|---|
0 | +Total | +8,900 | +7,173 | +7,860 | +2.71 | +2.16 | +2.37 | +
1 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +
2 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +
3 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +
4 | +Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +
Wait…but now we can’t differentiate between the “Number of TB cases” and “TB incidence” year columns. pandas
has tried to make our lives easier by automatically adding “.1” to the latter columns, but this doesn’t help us, as humans, understand the data.
We can rename the columns manually with df.rename() (documentation):
rename_dict = {'2019': 'TB cases 2019',
               '2020': 'TB cases 2020',
               '2021': 'TB cases 2021',
               '2019.1': 'TB incidence 2019',
               '2020.1': 'TB incidence 2020',
               '2021.1': 'TB incidence 2021'}
tb_df = tb_df.rename(columns=rename_dict)
tb_df.head(5)
+ | U.S. jurisdiction | +TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +
---|---|---|---|---|---|---|---|
0 | +Total | +8,900 | +7,173 | +7,860 | +2.71 | +2.16 | +2.37 | +
1 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +
2 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +
3 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +
4 | +Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +
You might already be wondering: what’s up with that first record?
+Row 0 is what we call a rollup record, or summary record. It’s often useful when displaying tables to humans. The granularity of record 0 (Totals) vs the rest of the records (States) is different.
+Okay, EDA step two. How was the rollup record aggregated?
+Let’s check if Total TB cases is the sum of all state TB cases. If we sum over all rows, we should get 2x the total cases in each of our TB cases by year (why do you think this is?).
tb_df.sum(axis=0)
U.S. jurisdiction TotalAlabamaAlaskaArizonaArkansasCaliforniaCol...
+TB cases 2019 8,9008758183642,111666718245583029973261085237...
+TB cases 2020 7,1737258136591,706525417194122219282169239376...
+TB cases 2021 7,8609258129691,750585443194992281064255127494...
+TB incidence 2019 109.94
+TB incidence 2020 93.09
+TB incidence 2021 102.94
+dtype: object
+Whoa, what’s going on with the TB cases in 2019, 2020, and 2021? Check out the column types:
+ tb_df.dtypes
U.S. jurisdiction object
+TB cases 2019 object
+TB cases 2020 object
+TB cases 2021 object
+TB incidence 2019 float64
+TB incidence 2020 float64
+TB incidence 2021 float64
+dtype: object
+Since there are commas in the values for TB cases, the numbers are read as the object
datatype, or storage type (close to the Python
string datatype), so pandas
is concatenating strings instead of adding integers (recall that Python can “sum”, or concatenate, strings together: "data" + "100"
evaluates to "data100"
).
Fortunately read_csv
also has a thousands
parameter (documentation):
# improve readability: chaining method calls with outer parentheses/line breaks
tb_df = (
    pd.read_csv("data/cdc_tuberculosis.csv", header=1, thousands=',')
    .rename(columns=rename_dict)
)
tb_df.head(5)
+ | U.S. jurisdiction | +TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +
---|---|---|---|---|---|---|---|
0 | +Total | +8900 | +7173 | +7860 | +2.71 | +2.16 | +2.37 | +
1 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +
2 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +
3 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +
4 | +Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +
tb_df.sum()
U.S. jurisdiction TotalAlabamaAlaskaArizonaArkansasCaliforniaCol...
+TB cases 2019 17800
+TB cases 2020 14346
+TB cases 2021 15720
+TB incidence 2019 109.94
+TB incidence 2020 93.09
+TB incidence 2021 102.94
+dtype: object
+The total TB cases look right. Phew!
+Let’s just look at the records with state-level granularity:
state_tb_df = tb_df[1:]
state_tb_df.head(5)
+ | U.S. jurisdiction | +TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +
---|---|---|---|---|---|---|---|
1 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +
2 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +
3 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +
4 | +Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +
5 | +California | +2111 | +1706 | +1750 | +5.35 | +4.32 | +4.46 | +
U.S. Census population estimates source (2019), source (2020-2021).
Running the below cells cleans the data. There are a few new methods here:
* df.convert_dtypes() (documentation) conveniently converts all float dtypes into ints and is out of scope for the class.
* df.dropna() (documentation) will be explained in more detail next time.
# 2010s census data
census_2010s_df = pd.read_csv("data/nst-est2019-01.csv", header=3, thousands=",")
census_2010s_df = (
    census_2010s_df
    .reset_index()
    .drop(columns=["index", "Census", "Estimates Base"])
    .rename(columns={"Unnamed: 0": "Geographic Area"})
    .convert_dtypes()  # "smart" converting of columns, use at your own risk
    .dropna()  # we'll introduce this next time
)
census_2010s_df['Geographic Area'] = census_2010s_df['Geographic Area'].str.strip('.')

# with pd.option_context('display.min_rows', 30):  # shows more rows
#     display(census_2010s_df)

census_2010s_df.head(5)
+ | Geographic Area | +2010 | +2011 | +2012 | +2013 | +2014 | +2015 | +2016 | +2017 | +2018 | +2019 | +
---|---|---|---|---|---|---|---|---|---|---|---|
0 | +United States | +309321666 | +311556874 | +313830990 | +315993715 | +318301008 | +320635163 | +322941311 | +324985539 | +326687501 | +328239523 | +
1 | +Northeast | +55380134 | +55604223 | +55775216 | +55901806 | +56006011 | +56034684 | +56042330 | +56059240 | +56046620 | +55982803 | +
2 | +Midwest | +66974416 | +67157800 | +67336743 | +67560379 | +67745167 | +67860583 | +67987540 | +68126781 | +68236628 | +68329004 | +
3 | +South | +114866680 | +116006522 | +117241208 | +118364400 | +119624037 | +120997341 | +122351760 | +123542189 | +124569433 | +125580448 | +
4 | +West | +72100436 | +72788329 | +73477823 | +74167130 | +74925793 | +75742555 | +76559681 | +77257329 | +77834820 | +78347268 | +
Occasionally, you will want to modify code that you have imported. To reimport those modifications you can either use python
’s importlib
library:
from importlib import reload
+reload(utils)
or use iPython
magic which will intelligently import code when files change:
%load_ext autoreload
+%autoreload 2
# census 2020s data
census_2020s_df = pd.read_csv("data/NST-EST2022-POP.csv", header=3, thousands=",")
census_2020s_df = (
    census_2020s_df
    .reset_index()
    .drop(columns=["index", "Unnamed: 1"])
    .rename(columns={"Unnamed: 0": "Geographic Area"})
    .convert_dtypes()  # "smart" converting of columns, use at your own risk
    .dropna()  # we'll introduce this next time
)
census_2020s_df['Geographic Area'] = census_2020s_df['Geographic Area'].str.strip('.')

census_2020s_df.head(5)
+ | Geographic Area | +2020 | +2021 | +2022 | +
---|---|---|---|---|
0 | +United States | +331511512 | +332031554 | +333287557 | +
1 | +Northeast | +57448898 | +57259257 | +57040406 | +
2 | +Midwest | +68961043 | +68836505 | +68787595 | +
3 | +South | +126450613 | +127346029 | +128716192 | +
4 | +West | +78650958 | +78589763 | +78743364 | +
Merging DataFrames
Time to merge! Here we use the DataFrame method df1.merge(right=df2, ...) on DataFrame df1 (documentation). Contrast this with the function pd.merge(left=df1, right=df2, ...) (documentation). Feel free to use either.
# merge TB DataFrame with two US census DataFrames
tb_census_df = (
    tb_df
    .merge(right=census_2010s_df,
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .merge(right=census_2020s_df,
           left_on="U.S. jurisdiction", right_on="Geographic Area")
)
tb_census_df.head(5)
+ | U.S. jurisdiction | +TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +Geographic Area_x | +2010 | +2011 | +2012 | +2013 | +2014 | +2015 | +2016 | +2017 | +2018 | +2019 | +Geographic Area_y | +2020 | +2021 | +2022 | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +Alabama | +4785437 | +4799069 | +4815588 | +4830081 | +4841799 | +4852347 | +4863525 | +4874486 | +4887681 | +4903185 | +Alabama | +5031362 | +5049846 | +5074296 | +
1 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +Alaska | +713910 | +722128 | +730443 | +737068 | +736283 | +737498 | +741456 | +739700 | +735139 | +731545 | +Alaska | +732923 | +734182 | +733583 | +
2 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +Arizona | +6407172 | +6472643 | +6554978 | +6632764 | +6730413 | +6829676 | +6941072 | +7044008 | +7158024 | +7278717 | +Arizona | +7179943 | +7264877 | +7359197 | +
3 | +Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +Arkansas | +2921964 | +2940667 | +2952164 | +2959400 | +2967392 | +2978048 | +2989918 | +3001345 | +3009733 | +3017804 | +Arkansas | +3014195 | +3028122 | +3045637 | +
4 | +California | +2111 | +1706 | +1750 | +5.35 | +4.32 | +4.46 | +California | +37319502 | +37638369 | +37948800 | +38260787 | +38596972 | +38918045 | +39167117 | +39358497 | +39461588 | +39512223 | +California | +39501653 | +39142991 | +39029342 | +
Having all of these columns is a little unwieldy. We could either drop the unneeded columns now, or just merge on smaller census DataFrame
s. Let’s do the latter.
# try merging again, but cleaner this time
tb_census_df = (
    tb_df
    .merge(right=census_2010s_df[["Geographic Area", "2019"]],
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .drop(columns="Geographic Area")
    .merge(right=census_2020s_df[["Geographic Area", "2020", "2021"]],
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .drop(columns="Geographic Area")
)
tb_census_df.head(5)
+ | U.S. jurisdiction | +TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +2019 | +2020 | +2021 | +
---|---|---|---|---|---|---|---|---|---|---|
0 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +4903185 | +5031362 | +5049846 | +
1 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +731545 | +732923 | +734182 | +
2 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +7278717 | +7179943 | +7264877 | +
3 | +Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +3017804 | +3014195 | +3028122 | +
4 | +California | +2111 | +1706 | +1750 | +5.35 | +4.32 | +4.46 | +39512223 | +39501653 | +39142991 | +
Let’s recompute incidence to make sure we know where the original CDC numbers came from.
+From the CDC report: TB incidence is computed as “Cases per 100,000 persons using mid-year population estimates from the U.S. Census Bureau.”
+If we define a group as 100,000 people, then we can compute the TB incidence for a given state population as
+\[\text{TB incidence} = \frac{\text{TB cases in population}}{\text{groups in population}} = \frac{\text{TB cases in population}}{\text{population}/100000} \]
+\[= \frac{\text{TB cases in population}}{\text{population}} \times 100000\]
+Let’s try this for 2019:
+"recompute incidence 2019"] = tb_census_df["TB cases 2019"]/tb_census_df["2019"]*100000
+ tb_census_df[5) tb_census_df.head(
+ | U.S. jurisdiction | +TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +2019 | +2020 | +2021 | +recompute incidence 2019 | +
---|---|---|---|---|---|---|---|---|---|---|---|
0 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +4903185 | +5031362 | +5049846 | +1.77 | +
1 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +731545 | +732923 | +734182 | +7.93 | +
2 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +7278717 | +7179943 | +7264877 | +2.51 | +
3 | +Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +3017804 | +3014195 | +3028122 | +2.12 | +
4 | +California | +2111 | +1706 | +1750 | +5.35 | +4.32 | +4.46 | +39512223 | +39501653 | +39142991 | +5.34 | +
Awesome!!!
+Let’s use a for-loop and Python format strings to compute TB incidence for all years. Python f-strings are just used for the purposes of this demo, but they’re handy to know when you explore data beyond this course (documentation).
# recompute incidence for all years
for year in [2019, 2020, 2021]:
    tb_census_df[f"recompute incidence {year}"] = tb_census_df[f"TB cases {year}"] / tb_census_df[f"{year}"] * 100000
tb_census_df.head(5)
+ | U.S. jurisdiction | +TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +2019 | +2020 | +2021 | +recompute incidence 2019 | +recompute incidence 2020 | +recompute incidence 2021 | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +4903185 | +5031362 | +5049846 | +1.77 | +1.43 | +1.82 | +
1 | +Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +731545 | +732923 | +734182 | +7.93 | +7.91 | +7.90 | +
2 | +Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +7278717 | +7179943 | +7264877 | +2.51 | +1.89 | +1.78 | +
3 | +Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +3017804 | +3014195 | +3028122 | +2.12 | +1.96 | +2.28 | +
4 | +California | +2111 | +1706 | +1750 | +5.35 | +4.32 | +4.46 | +39512223 | +39501653 | +39142991 | +5.34 | +4.32 | +4.47 | +
These numbers look pretty close!!! There are a few errors in the hundredths place, particularly in 2021. It may be useful to further explore reasons behind this discrepancy.
+ tb_census_df.describe()
+ | TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +2019 | +2020 | +2021 | +recompute incidence 2019 | +recompute incidence 2020 | +recompute incidence 2021 | +
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | +51.00 | +51.00 | +51.00 | +51.00 | +51.00 | +51.00 | +51.00 | +51.00 | +51.00 | +51.00 | +51.00 | +51.00 | +
mean | +174.51 | +140.65 | +154.12 | +2.10 | +1.78 | +1.97 | +6436069.08 | +6500225.73 | +6510422.63 | +2.10 | +1.78 | +1.97 | +
std | +341.74 | +271.06 | +286.78 | +1.50 | +1.34 | +1.48 | +7360660.47 | +7408168.46 | +7394300.08 | +1.50 | +1.34 | +1.47 | +
min | +1.00 | +0.00 | +2.00 | +0.17 | +0.00 | +0.21 | +578759.00 | +577605.00 | +579483.00 | +0.17 | +0.00 | +0.21 | +
25% | +25.50 | +29.00 | +23.00 | +1.29 | +1.21 | +1.23 | +1789606.00 | +1820311.00 | +1844920.00 | +1.30 | +1.21 | +1.23 | +
50% | +70.00 | +67.00 | +69.00 | +1.80 | +1.52 | +1.70 | +4467673.00 | +4507445.00 | +4506589.00 | +1.81 | +1.52 | +1.69 | +
75% | +180.50 | +139.00 | +150.00 | +2.58 | +1.99 | +2.22 | +7446805.00 | +7451987.00 | +7502811.00 | +2.58 | +1.99 | +2.22 | +
max | +2111.00 | +1706.00 | +1750.00 | +7.91 | +7.92 | +7.92 | +39512223.00 | +39501653.00 | +39142991.00 | +7.93 | +7.91 | +7.90 | +
How do we reproduce that reported statistic in the original CDC report?
+++Reported TB incidence (cases per 100,000 persons) increased 9.4%, from 2.2 during 2020 to 2.4 during 2021 but was lower than incidence during 2019 (2.7). Increases occurred among both U.S.-born and non–U.S.-born persons.
+
This is TB incidence computed across the entire U.S. population! How do we reproduce this?
* We need to reproduce the “Total” TB incidences in our rolled record.
* But our current tb_census_df only has 51 entries (50 states plus Washington, D.C.). There is no rolled record.
* What happened…?
Let’s get exploring!
+Before we keep exploring, we’ll set all indexes to more meaningful values, instead of just numbers that pertain to some row at some point. This will make our cleaning slightly easier.
tb_df = tb_df.set_index("U.S. jurisdiction")
tb_df.head(5)
+ | TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +
---|---|---|---|---|---|---|
U.S. jurisdiction | ++ | + | + | + | + | + |
Total | +8900 | +7173 | +7860 | +2.71 | +2.16 | +2.37 | +
Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +
Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +
Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +
Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +
census_2010s_df = census_2010s_df.set_index("Geographic Area")
census_2010s_df.head(5)
+ | 2010 | +2011 | +2012 | +2013 | +2014 | +2015 | +2016 | +2017 | +2018 | +2019 | +
---|---|---|---|---|---|---|---|---|---|---|
Geographic Area | ++ | + | + | + | + | + | + | + | + | + |
United States | +309321666 | +311556874 | +313830990 | +315993715 | +318301008 | +320635163 | +322941311 | +324985539 | +326687501 | +328239523 | +
Northeast | +55380134 | +55604223 | +55775216 | +55901806 | +56006011 | +56034684 | +56042330 | +56059240 | +56046620 | +55982803 | +
Midwest | +66974416 | +67157800 | +67336743 | +67560379 | +67745167 | +67860583 | +67987540 | +68126781 | +68236628 | +68329004 | +
South | +114866680 | +116006522 | +117241208 | +118364400 | +119624037 | +120997341 | +122351760 | +123542189 | +124569433 | +125580448 | +
West | +72100436 | +72788329 | +73477823 | +74167130 | +74925793 | +75742555 | +76559681 | +77257329 | +77834820 | +78347268 | +
census_2020s_df = census_2020s_df.set_index("Geographic Area")
census_2020s_df.head(5)
+ | 2020 | +2021 | +2022 | +
---|---|---|---|
Geographic Area | ++ | + | + |
United States | +331511512 | +332031554 | +333287557 | +
Northeast | +57448898 | +57259257 | +57040406 | +
Midwest | +68961043 | +68836505 | +68787595 | +
South | +126450613 | +127346029 | +128716192 | +
West | +78650958 | +78589763 | +78743364 | +
It turns out that our merge above only kept state records, even though our original tb_df
had the “Total” rolled record:
tb_df.head()
+ | TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +
---|---|---|---|---|---|---|
U.S. jurisdiction | ++ | + | + | + | + | + |
Total | +8900 | +7173 | +7860 | +2.71 | +2.16 | +2.37 | +
Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +
Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +
Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +
Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +
Recall that merge performs an inner merge by default, meaning that it only preserves keys that are present in both DataFrames.
The rolled records in our census DataFrame
have different Geographic Area
fields, which was the key we merged on:
census_2010s_df.head(5)
+ | 2010 | +2011 | +2012 | +2013 | +2014 | +2015 | +2016 | +2017 | +2018 | +2019 | +
---|---|---|---|---|---|---|---|---|---|---|
Geographic Area | ++ | + | + | + | + | + | + | + | + | + |
United States | +309321666 | +311556874 | +313830990 | +315993715 | +318301008 | +320635163 | +322941311 | +324985539 | +326687501 | +328239523 | +
Northeast | +55380134 | +55604223 | +55775216 | +55901806 | +56006011 | +56034684 | +56042330 | +56059240 | +56046620 | +55982803 | +
Midwest | +66974416 | +67157800 | +67336743 | +67560379 | +67745167 | +67860583 | +67987540 | +68126781 | +68236628 | +68329004 | +
South | +114866680 | +116006522 | +117241208 | +118364400 | +119624037 | +120997341 | +122351760 | +123542189 | +124569433 | +125580448 | +
West | +72100436 | +72788329 | +73477823 | +74167130 | +74925793 | +75742555 | +76559681 | +77257329 | +77834820 | +78347268 | +
The Census DataFrame
has several rolled records. The aggregate record we are looking for actually has the Geographic Area named “United States”.
One straightforward way to get the right merge is to rename the value itself. Because we now have the Geographic Area index, we’ll use df.rename()
(documentation):
# rename rolled record for 2010s
census_2010s_df.rename(index={'United States': 'Total'}, inplace=True)
census_2010s_df.head(5)
+ | 2010 | +2011 | +2012 | +2013 | +2014 | +2015 | +2016 | +2017 | +2018 | +2019 | +
---|---|---|---|---|---|---|---|---|---|---|
Geographic Area | ++ | + | + | + | + | + | + | + | + | + |
Total | +309321666 | +311556874 | +313830990 | +315993715 | +318301008 | +320635163 | +322941311 | +324985539 | +326687501 | +328239523 | +
Northeast | +55380134 | +55604223 | +55775216 | +55901806 | +56006011 | +56034684 | +56042330 | +56059240 | +56046620 | +55982803 | +
Midwest | +66974416 | +67157800 | +67336743 | +67560379 | +67745167 | +67860583 | +67987540 | +68126781 | +68236628 | +68329004 | +
South | +114866680 | +116006522 | +117241208 | +118364400 | +119624037 | +120997341 | +122351760 | +123542189 | +124569433 | +125580448 | +
West | +72100436 | +72788329 | +73477823 | +74167130 | +74925793 | +75742555 | +76559681 | +77257329 | +77834820 | +78347268 | +
# same, but for 2020s rename rolled record
census_2020s_df.rename(index={'United States': 'Total'}, inplace=True)
census_2020s_df.head(5)
+ | 2020 | +2021 | +2022 | +
---|---|---|---|
Geographic Area | ++ | + | + |
Total | +331511512 | +332031554 | +333287557 | +
Northeast | +57448898 | +57259257 | +57040406 | +
Midwest | +68961043 | +68836505 | +68787595 | +
South | +126450613 | +127346029 | +128716192 | +
West | +78650958 | +78589763 | +78743364 | +
Next let’s rerun our merge. Note the different chaining, because we are now merging on indexes (df.merge()
documentation).
tb_census_df = (
    tb_df
    .merge(right=census_2010s_df[["2019"]],
           left_index=True, right_index=True)
    .merge(right=census_2020s_df[["2020", "2021"]],
           left_index=True, right_index=True)
)
tb_census_df.head(5)
+ | TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +2019 | +2020 | +2021 | +
---|---|---|---|---|---|---|---|---|---|
Total | +8900 | +7173 | +7860 | +2.71 | +2.16 | +2.37 | +328239523 | +331511512 | +332031554 | +
Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +4903185 | +5031362 | +5049846 | +
Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +731545 | +732923 | +734182 | +
Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +7278717 | +7179943 | +7264877 | +
Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +3017804 | +3014195 | +3028122 | +
Finally, let’s recompute our incidences:
# recompute incidence for all years
for year in [2019, 2020, 2021]:
    tb_census_df[f"recompute incidence {year}"] = tb_census_df[f"TB cases {year}"] / tb_census_df[f"{year}"] * 100000
tb_census_df.head(5)
+ | TB cases 2019 | +TB cases 2020 | +TB cases 2021 | +TB incidence 2019 | +TB incidence 2020 | +TB incidence 2021 | +2019 | +2020 | +2021 | +recompute incidence 2019 | +recompute incidence 2020 | +recompute incidence 2021 | +
---|---|---|---|---|---|---|---|---|---|---|---|---|
Total | +8900 | +7173 | +7860 | +2.71 | +2.16 | +2.37 | +328239523 | +331511512 | +332031554 | +2.71 | +2.16 | +2.37 | +
Alabama | +87 | +72 | +92 | +1.77 | +1.43 | +1.83 | +4903185 | +5031362 | +5049846 | +1.77 | +1.43 | +1.82 | +
Alaska | +58 | +58 | +58 | +7.91 | +7.92 | +7.92 | +731545 | +732923 | +734182 | +7.93 | +7.91 | +7.90 | +
Arizona | +183 | +136 | +129 | +2.51 | +1.89 | +1.77 | +7278717 | +7179943 | +7264877 | +2.51 | +1.89 | +1.78 | +
Arkansas | +64 | +59 | +69 | +2.12 | +1.96 | +2.28 | +3017804 | +3014195 | +3028122 | +2.12 | +1.96 | +2.28 | +
We reproduced the total U.S. incidences correctly!
+We’re almost there. Let’s revisit the quote:
+++Reported TB incidence (cases per 100,000 persons) increased 9.4%, from 2.2 during 2020 to 2.4 during 2021 but was lower than incidence during 2019 (2.7). Increases occurred among both U.S.-born and non–U.S.-born persons.
+
Recall that percent change from \(A\) to \(B\) is computed as \(\text{percent change} = \frac{B - A}{A} \times 100\).
incidence_2020 = tb_census_df.loc['Total', 'recompute incidence 2020']
incidence_2020
np.float64(2.1637257652759883)
incidence_2021 = tb_census_df.loc['Total', 'recompute incidence 2021']
incidence_2021
np.float64(2.3672448914298068)
difference = (incidence_2021 - incidence_2020) / incidence_2020 * 100
difference
np.float64(9.405957511804143)
+Mauna Loa Observatory has been monitoring CO2 concentrations since 1958.
+= "data/co2_mm_mlo.txt" co2_file
Let’s do some EDA!!
Reading it into Pandas?
Let’s instead check out this .txt file. Some questions to keep in mind: Do we trust this file extension? What structure is it?
Lines 71-78 (inclusive) are shown below:
+line number | file contents
+
+71 | # decimal average interpolated trend #days
+72 | # date (season corr)
+73 | 1958 3 1958.208 315.71 315.71 314.62 -1
+74 | 1958 4 1958.292 317.45 317.45 315.29 -1
+75 | 1958 5 1958.375 317.50 317.50 314.71 -1
+76 | 1958 6 1958.458 -99.99 317.10 314.85 -1
+77 | 1958 7 1958.542 315.86 315.86 314.98 -1
+78 | 1958 8 1958.625 314.93 314.93 315.94 -1
Notice how the values are separated by white space rather than commas, and how the column headings are split across the two comment lines (71 and 72).
+We can use read_csv
to read the data into a pandas
DataFrame
, and we provide several arguments to specify that the separators are white space, there is no header (we will set our own column names), and to skip the first 72 rows of the file.
co2 = pd.read_csv(
    co2_file, header=None, skiprows=72,
    sep=r'\s+'  # delimiter for continuous whitespace (stay tuned for regex next lecture)
)
co2.head()
+ | 0 | +1 | +2 | +3 | +4 | +5 | +6 | +
---|---|---|---|---|---|---|---|
0 | +1958 | +3 | +1958.21 | +315.71 | +315.71 | +314.62 | +-1 | +
1 | +1958 | +4 | +1958.29 | +317.45 | +317.45 | +315.29 | +-1 | +
2 | +1958 | +5 | +1958.38 | +317.50 | +317.50 | +314.71 | +-1 | +
3 | +1958 | +6 | +1958.46 | +-99.99 | +317.10 | +314.85 | +-1 | +
4 | +1958 | +7 | +1958.54 | +315.86 | +315.86 | +314.98 | +-1 | +
Congratulations! You’ve wrangled the data!
+…But our columns aren’t named. We need to do more EDA.
+The NOAA webpage might have some useful tidbits (in this case it doesn’t).
+Using this information, we’ll rerun pd.read_csv
, but this time with some custom column names.
co2 = pd.read_csv(
    co2_file, header=None, skiprows=72,
    sep=r'\s+',  # regex for continuous whitespace (next lecture)
    names=['Yr', 'Mo', 'DecDate', 'Avg', 'Int', 'Trend', 'Days']
)
co2.head()
+ | Yr | +Mo | +DecDate | +Avg | +Int | +Trend | +Days | +
---|---|---|---|---|---|---|---|
0 | +1958 | +3 | +1958.21 | +315.71 | +315.71 | +314.62 | +-1 | +
1 | +1958 | +4 | +1958.29 | +317.45 | +317.45 | +315.29 | +-1 | +
2 | +1958 | +5 | +1958.38 | +317.50 | +317.50 | +314.71 | +-1 | +
3 | +1958 | +6 | +1958.46 | +-99.99 | +317.10 | +314.85 | +-1 | +
4 | +1958 | +7 | +1958.54 | +315.86 | +315.86 | +314.98 | +-1 | +
Scientific studies tend to have very clean data, right…? Let’s jump right in and make a time series plot of CO2 monthly averages.
sns.lineplot(x='DecDate', y='Avg', data=co2);
The code above uses the seaborn
plotting library (abbreviated sns
). We will cover this in the Visualization lecture, but now you don’t need to worry about how it works!
Yikes! Plotting the data uncovered a problem. The sharp vertical lines suggest that we have some missing values. What happened here?
+ co2.head()
+ | Yr | +Mo | +DecDate | +Avg | +Int | +Trend | +Days | +
---|---|---|---|---|---|---|---|
0 | +1958 | +3 | +1958.21 | +315.71 | +315.71 | +314.62 | +-1 | +
1 | +1958 | +4 | +1958.29 | +317.45 | +317.45 | +315.29 | +-1 | +
2 | +1958 | +5 | +1958.38 | +317.50 | +317.50 | +314.71 | +-1 | +
3 | +1958 | +6 | +1958.46 | +-99.99 | +317.10 | +314.85 | +-1 | +
4 | +1958 | +7 | +1958.54 | +315.86 | +315.86 | +314.98 | +-1 | +
co2.tail()
+ | Yr | +Mo | +DecDate | +Avg | +Int | +Trend | +Days | +
---|---|---|---|---|---|---|---|
733 | +2019 | +4 | +2019.29 | +413.32 | +413.32 | +410.49 | +26 | +
734 | +2019 | +5 | +2019.38 | +414.66 | +414.66 | +411.20 | +28 | +
735 | +2019 | +6 | +2019.46 | +413.92 | +413.92 | +411.58 | +27 | +
736 | +2019 | +7 | +2019.54 | +411.77 | +411.77 | +411.43 | +23 | +
737 | +2019 | +8 | +2019.62 | +409.95 | +409.95 | +411.84 | +29 | +
Some data have unusual values like -1 and -99.99.
+Let’s check the description at the top of the file again.
* -1 signifies a missing value for the number of days (Days) the equipment was in operation that month.
* -99.99 denotes a missing monthly average (Avg).
How can we fix this? First, let’s explore other aspects of our data. Understanding our data will help us decide what to do with the missing values.
+First, we consider the shape of the data. How many rows should we have?
+ co2.shape
(738, 7)
Nice!! The number of rows (i.e., records) matches our expectations.
+Let’s now check the quality of each feature.
+Days
Days
is a time field, so let’s analyze other time fields to see if there is an explanation for missing values of days of operation.
Let’s start with months, Mo
.
Are we missing any records? Each month should appear 61 or 62 times (March 1958-August 2019).
+"Mo"].value_counts().sort_index() co2[
Mo
+1 61
+2 61
+3 62
+4 62
+5 62
+6 62
+7 62
+8 62
+9 61
+10 61
+11 61
+12 61
+Name: count, dtype: int64
+As expected Jan, Feb, Sep, Oct, Nov, and Dec have 61 occurrences and the rest 62.
+Next let’s explore days Days
itself, which is the number of days that the measurement equipment worked.
sns.displot(co2['Days']);
plt.title("Distribution of days feature");  # the ; suppresses unneeded plotting output
In terms of data quality, a handful of months have averages based on measurements taken on fewer than half the days. In addition, there are nearly 200 missing values–that’s about 27% of the data!
+Finally, let’s check the last time feature, year Yr
.
Let’s check to see if there is any connection between missing-ness and the year of the recording.
+="Yr", y="Days", data=co2);
+ sns.scatterplot(x"Day field by Year"); # the ; suppresses output plt.title(
Observations:
+Potential Next Steps:
+Avg
Next, let’s return to the -99.99 values in Avg
to analyze the overall quality of the CO2 measurements. We’ll plot a histogram of the average CO2 measurements
# Histograms of average CO2 measurements
sns.displot(co2['Avg']);
The non-missing values are in the 300-400 range (a regular range of CO2 levels).
+We also see that there are only a few missing Avg
values (<1% of values). Let’s examine all of them:
"Avg"] < 0] co2[co2[
+ | Yr | +Mo | +DecDate | +Avg | +Int | +Trend | +Days | +
---|---|---|---|---|---|---|---|
3 | +1958 | +6 | +1958.46 | +-99.99 | +317.10 | +314.85 | +-1 | +
7 | +1958 | +10 | +1958.79 | +-99.99 | +312.66 | +315.61 | +-1 | +
71 | +1964 | +2 | +1964.12 | +-99.99 | +320.07 | +319.61 | +-1 | +
72 | +1964 | +3 | +1964.21 | +-99.99 | +320.73 | +319.55 | +-1 | +
73 | +1964 | +4 | +1964.29 | +-99.99 | +321.77 | +319.48 | +-1 | +
213 | +1975 | +12 | +1975.96 | +-99.99 | +330.59 | +331.60 | +0 | +
313 | +1984 | +4 | +1984.29 | +-99.99 | +346.84 | +344.27 | +2 | +
There doesn’t seem to be a pattern to these values, other than that most records also were missing Days
data.
Drop, NaN, or Impute Missing Avg Data?
How should we address the invalid Avg data?
Remember we want to fix the following plot:
sns.lineplot(x='DecDate', y='Avg', data=co2)
plt.title("CO2 Average By Month");
Since we are plotting Avg
vs DecDate
, we should just focus on dealing with missing values for Avg
.
Let’s consider a few options: 1. Drop those records 2. Replace -99.99 with NaN 3. Substitute it with a likely value for the average CO2?
+What do you think are the pros and cons of each possible action?
+Let’s examine each of these three options.
# 1. Drop missing values
co2_drop = co2[co2['Avg'] > 0]
co2_drop.head()
+ | Yr | +Mo | +DecDate | +Avg | +Int | +Trend | +Days | +
---|---|---|---|---|---|---|---|
0 | +1958 | +3 | +1958.21 | +315.71 | +315.71 | +314.62 | +-1 | +
1 | +1958 | +4 | +1958.29 | +317.45 | +317.45 | +315.29 | +-1 | +
2 | +1958 | +5 | +1958.38 | +317.50 | +317.50 | +314.71 | +-1 | +
4 | +1958 | +7 | +1958.54 | +315.86 | +315.86 | +314.98 | +-1 | +
5 | +1958 | +8 | +1958.62 | +314.93 | +314.93 | +315.94 | +-1 | +
# 2. Replace -99.99 with NaN
co2_NA = co2.replace(-99.99, np.nan)
co2_NA.head()
+ | Yr | +Mo | +DecDate | +Avg | +Int | +Trend | +Days | +
---|---|---|---|---|---|---|---|
0 | +1958 | +3 | +1958.21 | +315.71 | +315.71 | +314.62 | +-1 | +
1 | +1958 | +4 | +1958.29 | +317.45 | +317.45 | +315.29 | +-1 | +
2 | +1958 | +5 | +1958.38 | +317.50 | +317.50 | +314.71 | +-1 | +
3 | +1958 | +6 | +1958.46 | +NaN | +317.10 | +314.85 | +-1 | +
4 | +1958 | +7 | +1958.54 | +315.86 | +315.86 | +314.98 | +-1 | +
We’ll also use a third version of the data.
+First, we note that the dataset already comes with a substitute value for the -99.99.
+From the file description:
+++The
+interpolated
column includes average values from the preceding column (average
) and interpolated values where data are missing. Interpolated values are computed in two steps…
The Int
feature has values that exactly match those in Avg
, except when Avg
is -99.99, and then a reasonable estimate is used instead.
So, the third version of our data will use the Int
feature instead of Avg
.
# 3. Use interpolated column which estimates missing Avg values
co2_impute = co2.copy()
co2_impute['Avg'] = co2['Int']
co2_impute.head()
+ | Yr | +Mo | +DecDate | +Avg | +Int | +Trend | +Days | +
---|---|---|---|---|---|---|---|
0 | +1958 | +3 | +1958.21 | +315.71 | +315.71 | +314.62 | +-1 | +
1 | +1958 | +4 | +1958.29 | +317.45 | +317.45 | +315.29 | +-1 | +
2 | +1958 | +5 | +1958.38 | +317.50 | +317.50 | +314.71 | +-1 | +
3 | +1958 | +6 | +1958.46 | +317.10 | +317.10 | +314.85 | +-1 | +
4 | +1958 | +7 | +1958.54 | +315.86 | +315.86 | +314.98 | +-1 | +
What’s a reasonable estimate?
+To answer this question, let’s zoom in on a short time period, say the measurements in 1958 (where we know we have two missing values).
# results of plotting data in 1958

def line_and_points(data, ax, title):
    # assumes single year, hence Mo
    ax.plot('Mo', 'Avg', data=data)
    ax.scatter('Mo', 'Avg', data=data)
    ax.set_xlim(2, 13)
    ax.set_title(title)
    ax.set_xticks(np.arange(3, 13))

def data_year(data, year):
    return data[data["Yr"] == year]

# uses matplotlib subplots
# you may see more next week; focus on output for now
fig, axes = plt.subplots(ncols=3, figsize=(12, 4), sharey=True)

year = 1958
line_and_points(data_year(co2_drop, year), axes[0], title="1. Drop Missing")
line_and_points(data_year(co2_NA, year), axes[1], title="2. Missing Set to NaN")
line_and_points(data_year(co2_impute, year), axes[2], title="3. Missing Interpolated")

fig.suptitle(f"Monthly Averages for {year}")
plt.tight_layout()
In the big picture since there are only 7 Avg
values missing (<1% of 738 months), any of these approaches would work.
However, there is some appeal to option 3, imputing:
+Let’s replot our original figure with option 3:
sns.lineplot(x='DecDate', y='Avg', data=co2_impute)
plt.title("CO2 Average By Month, Imputed");
Looks pretty close to what we see on the NOAA website!
+From the description:
+The data you present depends on your research question.
+How do CO2 levels vary by season?
+Are CO2 levels rising over the past 50+ years, consistent with global warming predictions?
co2_year = co2_impute.groupby('Yr').mean()
sns.lineplot(x='Yr', y='Avg', data=co2_year)
plt.title("CO2 Average By Year");
Indeed, we see a rise by nearly 100 ppm of CO2 since Mauna Loa began recording in 1958.
+We went over a lot of content this lecture; let’s summarize the most important points:
There are a few options we can take to deal with missing data:
* Drop records with missing values
* Keep the missing values as NaN
* Impute using an interpolated or estimated value
There are several ways to approach EDA and Data Wrangling:
+At this point, we’ve grown quite familiar with the modeling process. We’ve introduced the concept of loss, used it to fit several types of models, and, most recently, extended our analysis to multiple regression. Along the way, we’ve forged our way through the mathematics of deriving the optimal model parameters in all its gory detail. It’s time to make our lives a little easier – let’s implement the modeling process in code!
+In this lecture, we’ll explore two techniques for model fitting:
* Implementing the formulas we derived for the optimal model parameters directly in python
* Using python’s sklearn package
With our new programming frameworks in hand, we will also add sophistication to our models by introducing more complex features to enhance model performance.
+Before we dive into feature engineering, let’s quickly review gradient descent, which we covered in the last lecture. Recall that gradient descent is a powerful technique for choosing the model parameters that minimize the loss function.
+As we learned earlier, we set the derivative of the loss function to zero and solve to determine the optimal parameters \(\theta\) that minimize loss. For a loss surface in 2D (or higher), the best way to minimize loss is to “walk” down the loss surface until we reach our optimal parameters \(\vec{\theta}\). The gradient vector tells us which direction to “walk” in.
+For example, the vector of parameter values \(\vec{\theta} = \begin{bmatrix} + \theta_{0} \\ + \theta_{1} \\ + \end{bmatrix}\) gives us a two parameter model (d = 2). To calculate our gradient vector, we can take the partial derivative of loss with respect to each parameter: \(\frac{\partial L}{\partial \theta_0}\) and \(\frac{\partial L}{\partial \theta_1}\).
+Its gradient vector would then be the 2D vector: \[\nabla_{\vec{\theta}} L = \begin{bmatrix} \frac{\partial L}{\partial \theta_0} \\ \frac{\partial L}{\partial \theta_1} \end{bmatrix}\]
+Note that \(-\nabla_{\vec{\theta}} L\) always points in the downhill direction of the surface.
+Recall that we also discussed the gradient descent update rule, where we nudge \(\theta\) in a negative gradient direction until \(\theta\) converges.
+As a refresher, the rule is as follows: \[\vec{\theta}^{(t+1)} = \vec{\theta}^{(t)} - \alpha \nabla_{\vec{\theta}} L(\vec{\theta}^{(t)}) \]
+Let’s now walk through an example of calculating and updating the gradient vector. Say our model and loss are: \[\begin{align} +f_{\vec{\theta}}(\vec{x}) &= \vec{x}^T\vec{\theta} = \theta_0x_0 + \theta_1x_1 +\\l(y, \hat{y}) &= (y - \hat{y})^2 +\end{align} +\]
+Plugging in \(f_{\vec{\theta}}(\vec{x})\) for \(\hat{y}\), our loss function becomes \(l(\vec{\theta}, \vec{x}, y_i) = (y_i - \theta_0x_0 - \theta_1x_1)^2\).
+To calculate our gradient vector, we can start by computing the partial derivative of the loss function with respect to \(\theta_0\): \[\frac{\partial}{\partial \theta_{0}} l(\vec{\theta}, \vec{x}, y_i) = 2(y_i - \theta_0x_0 - \theta_1x_1)(-x_0)\]
+Let’s now do the same but with respect to \(\theta_1\): \[\frac{\partial}{\partial \theta_{1}} l(\vec{\theta}, \vec{x}, y_i) = 2(y_i - \theta_0x_0 - \theta_1x_1)(-x_1)\]
+Putting this together, our gradient vector is: \[\nabla_{\theta} l(\vec{\theta}, \vec{x}, y_i) = \begin{bmatrix} -2(y_i - \theta_0x_0 - \theta_1x_1)(x_0) \\ -2(y_i - \theta_0x_0 - \theta_1x_1)(x_1) \end{bmatrix}\]
+Remember that we need to keep updating \(\theta\) until the algorithm converges to a solution and stops updating significantly (or at all). When updating \(\theta\), we’ll have a fixed number of updates and subsequent updates will be quite small (we won’t change \(\theta\) by much).
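To make the update rule concrete, here is a minimal NumPy sketch of batch gradient descent for the two-parameter model and squared loss above; the synthetic data, learning rate, and iteration count are illustrative choices, not values from the lecture.

import numpy as np

# Synthetic data: x0 is a constant column, x1 is random; y comes from "true" parameters [2, -3]
rng = np.random.default_rng(42)
X = np.column_stack([np.ones(100), rng.normal(size=100)])
y = X @ np.array([2.0, -3.0]) + rng.normal(scale=0.1, size=100)

theta = np.zeros(2)   # initial guess for [theta_0, theta_1]
alpha = 0.1           # learning rate

for _ in range(500):
    residuals = y - X @ theta                  # y_i - theta_0*x_0 - theta_1*x_1 for every point
    gradient = -2 * X.T @ residuals / len(y)   # average of the per-point gradient vectors above
    theta = theta - alpha * gradient           # the gradient descent update rule

print(theta)  # should land close to [2, -3]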
Let’s now dive deeper into gradient and stochastic gradient descent. In the previous lecture, we discussed how finding the gradient across all the data is extremely computationally taxing and takes a lot of resources to calculate.
+We know that the solution to the normal equation is \(\hat{\theta} = (\mathbb{X}^T\mathbb{X})^{-1}\mathbb{X}^T\mathbb{Y}\). Let’s break this down and determine the computational complexity for this solution.
+Let \(n\) be the number of samples (rows) and \(d\) be the number of features (columns).
+In total, calculating the solution to the normal equation takes \(O(nd^2) + O(d^3) + O(nd) + O(d^2)\) time. We can see that \(O(nd^2) + O(d^3)\) dominates the complexity — this can be problematic for high-dimensional models and very large datasets.
+On the other hand, the time complexity of a single gradient descent step takes only \(O(nd)\) time.
+Suppose we run \(T\) iterations. The final complexity would then be \(O(Tnd)\). Typically, \(n\) is much larger than \(T\) and \(d\). How can we reduce the cost of this algorithm using a technique from Data 100? Do we really need to use \(n\) data points? We don’t! Instead, we can use stochastic gradient descent.
+We know that our true gradient of \(\nabla_{\vec{\theta}} L (\vec{\theta^{(t)}}) = \frac{1}{n}\sum_{i=1}^{n}\nabla_{\vec{\theta}} l(y_i, f_{\vec{\theta}^{(t)}}(X_i))\) has a time complexity of \(O(nd)\). Instead of using all \(n\) samples to calculate the true gradient of the loss surface, let’s use a sample of our data to approximate. Say we sample \(b\) records (\(s_1, \cdots, s_b\)) from our \(n\) datapoints. Our new (stochastic) gradient descent function will be \(\nabla_{\vec{\theta}} L (\vec{\theta^{(t)}}) = \frac{1}{b}\sum_{i=1}^{b}\nabla_{\vec{\theta}} l(y_{s_i}, f_{\vec{\theta}^{(t)}}(X_{s_i}))\) and will now have a time complexity of \(O(bd)\), which is much faster!
Stochastic gradient descent helps us approximate the gradient while also reducing the time complexity and computational cost. The time complexity scales with the number of datapoints selected in the sample. To sample data, there are two approaches we can use: we can shuffle the data and iterate through it in mini-batches, or we can draw a simple random sample of records for each gradient computation.
+But how do we decide our mini-batch size (\(b\)), or the number of datapoints in our sample? The original stochastic gradient descent algorithm uses \(b=1\) so that only one sample is used to approximate the gradient at a time. Although we don’t use such a small mini-batch size often, \(b\) typically is small. When choosing \(b\), there are several factors to consider: a larger batch size results in a better gradient estimate, parallelism, and other systems factors. On the other hand, a smaller batch size will be faster and have more frequent updates. It is up to data scientists to balance the tradeoff between batch size and time complexity.
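To see how sampling changes the update, here is a minimal sketch of mini-batch stochastic gradient descent for the same kind of linear model; the batch size b, learning rate, and synthetic data are again illustrative assumptions rather than the lecture's own code.

import numpy as np

rng = np.random.default_rng(0)
n, d = 1000, 2
X = rng.normal(size=(n, d))
y = X @ np.array([1.5, -0.5]) + rng.normal(scale=0.1, size=n)

theta = np.zeros(d)
alpha, b = 0.05, 32   # learning rate and mini-batch size (illustrative choices)

for _ in range(2000):
    idx = rng.choice(n, size=b, replace=False)   # sample b of the n records
    X_b, y_b = X[idx], y[idx]
    residuals = y_b - X_b @ theta
    gradient = -2 * X_b.T @ residuals / b        # gradient estimate from the mini-batch
    theta = theta - alpha * gradient             # same update rule, but O(bd) per step

print(theta)  # approximately [1.5, -0.5]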
Summarizing our two gradient descent techniques:
* (Batch) gradient descent computes the true gradient using all n data points, costing O(nd) per update.
* Mini-batch (stochastic) gradient descent approximates the gradient with a random sample of b data points, costing O(bd) per update.
+At this point in the course, we’ve equipped ourselves with some powerful techniques to build and optimize models. We’ve explored how to develop models of multiple variables, as well as how to transform variables to help linearize a dataset and fit these models to maximize their performance.
+All of this was done with one major caveat: the regression models we’ve worked with so far are all linear in the input variables. We’ve assumed that our predictions should be some combination of linear variables. While this works well in some cases, the real world isn’t always so straightforward. We’ll learn an important method to address this issue – feature engineering – and consider some new problems that can arise when we do so.
+Feature engineering is the process of transforming raw features into more informative features that can be used in modeling or EDA tasks and improve model performance.
+Feature engineering allows you to:
A feature function describes the transformations we apply to raw features in a dataset to create a design matrix of transformed features. We typically denote the feature function as \(\Phi\) (the Greek letter “phi”). When we apply the feature function to our original dataset \(\mathbb{X}\), the result, \(\Phi(\mathbb{X})\), is a transformed design matrix ready to be used in modeling.
+For example, we might design a feature function that computes the square of an existing feature and adds it to the design matrix. In this case, our existing matrix \([x]\) is transformed to \([x, x^2]\). Its dimension increases from 1 to 2. Often, the dimension of the featurized dataset increases as seen here.
+The new features introduced by the feature function can then be used in modeling. Often, we use the symbol \(\phi_i\) to represent transformed features after feature engineering.
+\[ +\begin{align} +\hat{y} &= \theta_0 + \theta_1 x + \theta_2 x^2 \\ +\hat{y} &= \theta_0 + \theta_1 \phi_1 + \theta_2 \phi_2 +\end{align} +\]
+In matrix notation, the symbol \(\Phi\) is sometimes used to denote the design matrix after feature engineering has been performed. Note that in the usage below, \(\Phi\) is now a feature-engineered matrix, rather than a function.
+\[\hat{\mathbb{Y}} = \Phi \theta\]
+More formally, we describe a feature function as transforming the original \(\mathbb{R}^{n \times p}\) dataset \(\mathbb{X}\) to a featurized \(\mathbb{R}^{n \times p'}\) dataset \(\mathbb{\Phi}\), where \(p'\) is typically greater than \(p\).
+\[\mathbb{X} \in \mathbb{R}^{n \times p} \longrightarrow \Phi \in \mathbb{R}^{n \times p'}\]
+Feature engineering opens up a whole new set of possibilities for designing better-performing models. As you will see in lab and homework, feature engineering is one of the most important parts of the entire modeling process.
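As a small, concrete illustration of this notation, the sketch below defines a feature function that maps a one-column design matrix [x] to [x, x^2], increasing p from 1 to 2; the name phi and the toy matrix are just stand-ins.

import numpy as np

def phi(X):
    """Feature function: append the square of each existing column."""
    return np.hstack([X, X**2])

X = np.array([[1.0], [2.0], [3.0]])   # original n x 1 design matrix
Phi = phi(X)                          # featurized n x 2 matrix [x, x^2]
print(Phi)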
+A particularly powerful use of feature engineering is to allow us to perform regression on non-numeric features. One hot encoding is a feature engineering technique that generates numeric features from categorical data, allowing us to use our usual methods to fit a regression model on the data.
+To illustrate how this works, we’ll refer back to the tips
dataset from previous lectures. Consider the "day"
column of the dataset:
import numpy as np
+import seaborn as sns
+import pandas as pd
+import sklearn.linear_model as lm
+= sns.load_dataset("tips")
+ tips tips.head()
+ | total_bill | +tip | +sex | +smoker | +day | +time | +size | +
---|---|---|---|---|---|---|---|
0 | +16.99 | +1.01 | +Female | +No | +Sun | +Dinner | +2 | +
1 | +10.34 | +1.66 | +Male | +No | +Sun | +Dinner | +3 | +
2 | +21.01 | +3.50 | +Male | +No | +Sun | +Dinner | +3 | +
3 | +23.68 | +3.31 | +Male | +No | +Sun | +Dinner | +2 | +
4 | +24.59 | +3.61 | +Female | +No | +Sun | +Dinner | +4 | +
At first glance, it doesn’t seem possible to fit a regression model to this data – we can’t directly perform any mathematical operations on the entry “Sun”.
+To resolve this, we instead create a new table with a feature for each unique value in the original "day"
column. We then iterate through the "day"
column. For each entry in "day"
we fill the corresponding feature in the new table with 1. All other features are set to 0.
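One quick way to see the table this paragraph describes, sketched below, is pandas’ get_dummies applied to the "day" column of the tips DataFrame loaded above; this is only an aside, and the lecture’s own approach with sklearn follows next.

# One row per record, one indicator column per unique day value
pd.get_dummies(tips["day"]).head()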
The OneHotEncoder
class of sklearn
(documentation) offers a quick way to perform this one-hot encoding. You will explore its use in detail in the lab. For now, recognize that we follow a very similar workflow to when we were working with the LinearRegression
class: we initialize a OneHotEncoder
object, fit it to our data, and finally use .transform
to apply the fitted encoder.
from sklearn.preprocessing import OneHotEncoder

# Initialize a OneHotEncoder object
ohe = OneHotEncoder()

# Fit the encoder
ohe.fit(tips[["day"]])

# Use the encoder to transform the raw "day" feature
encoded_day = ohe.transform(tips[["day"]]).toarray()
encoded_day_df = pd.DataFrame(encoded_day, columns=ohe.get_feature_names_out())

encoded_day_df.head()
+ | day_Fri | +day_Sat | +day_Sun | +day_Thur | +
---|---|---|---|---|
0 | +0.0 | +0.0 | +1.0 | +0.0 | +
1 | +0.0 | +0.0 | +1.0 | +0.0 | +
2 | +0.0 | +0.0 | +1.0 | +0.0 | +
3 | +0.0 | +0.0 | +1.0 | +0.0 | +
4 | +0.0 | +0.0 | +1.0 | +0.0 | +
The one-hot encoded features can then be used in the design matrix to train a model:
+\[\hat{y} = \theta_1 (\text{total}\_\text{bill}) + \theta_2 (\text{size}) + \theta_3 (\text{day}\_\text{Fri}) + \theta_4 (\text{day}\_\text{Sat}) + \theta_5 (\text{day}\_\text{Sun}) + \theta_6 (\text{day}\_\text{Thur})\]
+Or in shorthand:
+\[\hat{y} = \theta_{1}\phi_{1} + \theta_{2}\phi_{2} + \theta_{3}\phi_{3} + \theta_{4}\phi_{4} + \theta_{5}\phi_{5} + \theta_{6}\phi_{6}\]
+Now, the day
feature (or rather, the four new boolean features that represent day) can be used to fit a model.
Using sklearn
to fit the new model, we can determine the model coefficients, allowing us to understand how each feature impacts the predicted tip.
from sklearn.linear_model import LinearRegression

data_w_ohe = tips[["total_bill", "size", "day"]].join(encoded_day_df).drop(columns="day")
ohe_model = lm.LinearRegression(fit_intercept=False)  # Tell sklearn to not add an additional bias column. Why?
ohe_model.fit(data_w_ohe, tips["tip"])

pd.DataFrame({"Feature": data_w_ohe.columns, "Model Coefficient": ohe_model.coef_})
+ | Feature | +Model Coefficient | +
---|---|---|
0 | +total_bill | +0.092994 | +
1 | +size | +0.187132 | +
2 | +day_Fri | +0.745787 | +
3 | +day_Sat | +0.621129 | +
4 | +day_Sun | +0.732289 | +
5 | +day_Thur | +0.668294 | +
For example, when looking at the coefficient for day_Fri
, we can now understand the impact of it being Friday on the predicted tip — if it is a Friday, the predicted tip increases by approximately $0.75.
When one-hot encoding, keep in mind that any set of one-hot encoded columns will always sum to a column of all ones, representing the bias column. More formally, the bias column is a linear combination of the OHE columns.
+We must be careful not to include this bias column in our design matrix. Otherwise, there will be linear dependence in the model, meaning \(\mathbb{X}^{\top}\mathbb{X}\) would no longer be invertible, and our OLS estimate \(\hat{\theta} = (\mathbb{X}^{\top}\mathbb{X})^{-1}\mathbb{X}^{\top}\mathbb{Y}\) fails.
+To resolve this issue, we simply omit one of the one-hot encoded columns or do not include an intercept term. The adjusted design matrices are shown below.
Either approach works: no information is lost, since the omitted column is simply a linear combination of the remaining columns and can always be recovered from them.
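In code, one reasonable way to omit a redundant column, sketched below, is to pass drop="first" to the encoder, so that each categorical feature contributes one fewer column than in the encoding above; this assumes the tips DataFrame and imports from the earlier cells.

from sklearn.preprocessing import OneHotEncoder

# Drop one category per feature so the encoded columns no longer sum to the bias column
ohe_drop = OneHotEncoder(drop="first")
encoded_day_dropped = ohe_drop.fit_transform(tips[["day"]]).toarray()
pd.DataFrame(encoded_day_dropped, columns=ohe_drop.get_feature_names_out()).head()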
+We have encountered a few cases now where models with linear features have performed poorly on datasets that show clear non-linear curvature.
+As an example, consider the vehicles
dataset, which contains information about cars. Suppose we want to use the hp
(horsepower) of a car to predict its "mpg"
(gas mileage in miles per gallon). If we visualize the relationship between these two variables, we see clear non-linear curvature. Fitting a linear model to these variables results in a high (poor) value of MSE.
\[\hat{y} = \theta_0 + \theta_1 (\text{hp})\]
import sklearn.linear_model as lm  # import added so the lm alias used below is defined

pd.options.mode.chained_assignment = None
vehicles = sns.load_dataset("mpg").dropna().rename(columns={"horsepower": "hp"}).sort_values("hp")

X = vehicles[["hp"]]
Y = vehicles["mpg"]

hp_model = lm.LinearRegression()
hp_model.fit(X, Y)
hp_model_predictions = hp_model.predict(X)

import matplotlib.pyplot as plt

sns.scatterplot(data=vehicles, x="hp", y="mpg")
plt.plot(vehicles["hp"], hp_model_predictions, c="tab:red");

print(f"MSE of model with (hp) feature: {np.mean((Y-hp_model_predictions)**2)}")
MSE of model with (hp) feature: 23.943662938603108
+As we can see from the plot, the data follows a curved line rather than a straight one. To capture this non-linearity, we can incorporate non-linear features. Let’s introduce a polynomial term, \(\text{hp}^2\), into our regression model. The model now takes the form:
+\[\hat{y} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2)\] \[\hat{y} = \theta_0 + \theta_1 \phi_1 + \theta_2 \phi_2\]
+How can we fit a model with non-linear features? We can use the exact same techniques as before: ordinary least squares, gradient descent, or sklearn
. This is because our new model is still a linear model. Although it contains non-linear features, it is linear with respect to the model parameters. All of our previous work on fitting models was done under the assumption that we were working with linear models. Because our new model is still linear, we can apply our existing methods to determine the optimal parameters.
# Add a hp^2 feature to the design matrix
X = vehicles[["hp"]]
X["hp^2"] = vehicles["hp"]**2

# Use sklearn to fit the model
hp2_model = lm.LinearRegression()
hp2_model.fit(X, Y)
hp2_model_predictions = hp2_model.predict(X)

sns.scatterplot(data=vehicles, x="hp", y="mpg")
plt.plot(vehicles["hp"], hp2_model_predictions, c="tab:red");

print(f"MSE of model with (hp^2) feature: {np.mean((Y-hp2_model_predictions)**2)}")
MSE of model with (hp^2) feature: 18.98476890761722
+Looking a lot better! By incorporating a squared feature, we are able to capture the curvature of the dataset. Our model is now a parabola centered on our data. Notice that our new model’s error has decreased relative to the original model with linear features.
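As an aside, rather than adding the squared column by hand, sklearn's PolynomialFeatures transformer can generate polynomial features automatically. A minimal sketch (the degree here is just for illustration, and the lm alias from above is assumed):

```python
from sklearn.preprocessing import PolynomialFeatures

# Generate [hp, hp^2]; include_bias=False leaves out the constant column,
# since LinearRegression adds its own intercept
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(vehicles[["hp"]])

hp2_model_alt = lm.LinearRegression()
hp2_model_alt.fit(X_poly, Y)
```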
+We’ve seen now that feature engineering allows us to build all sorts of features to improve the performance of the model. In particular, we saw that designing a more complex feature (squaring hp
in the vehicles
data previously) substantially improved the model’s ability to capture non-linear relationships. To take full advantage of this, we might be inclined to design increasingly complex features. Consider the following three models, each of different order (the maximum exponent power of each model):
As we can see in the plots above, MSE continues to decrease with each additional polynomial term. To visualize it further, let’s plot models as the complexity increases from 0 to 7:
+When we use our model to make predictions on the same data that was used to fit the model, we find that the MSE decreases with each additional polynomial term (as our model gets more complex). The training error is the model’s error when generating predictions from the same data that was used for training purposes. We can conclude that the training error goes down as the complexity of the model increases.
+This seems like good news – when working on the training data, we can improve model performance by designing increasingly complex models.
+ +However, high model complexity comes with its own set of issues. When building the vehicles
models above, we trained the models on the entire dataset and then evaluated their performance on this same dataset. In reality, we are likely to instead train the model on a sample from the population, then use it to make predictions on data it didn’t encounter during training.
Let’s walk through a more realistic example. Say we are given a training dataset of just 6 datapoints and want to train a model to then make predictions on a different set of points. We may be tempted to make a highly complex model (e.g., degree 5), especially given it makes perfect predictions on the training data as clear on the left. However, as shown in the graph on the right, this model would perform horribly on the rest of the population!
This phenomenon is called overfitting. The model effectively just memorized the training data it encountered when it was fitted, leaving it unable to generalize well to data it didn’t encounter during training. This is a problem: we want models that are generalizable to “unseen” data.
+Additionally, since complex models are sensitive to the specific dataset used to train them, they have high variance. A model with high variance tends to vary more dramatically when trained on different datasets. Going back to our example above, we can see our degree-5 model varies erratically when we fit it to different samples of 6 points from vehicles
.
We now face a dilemma: we know that we can decrease training error by increasing model complexity, but models that are too complex start to overfit and can’t be reapplied to new datasets due to high variance.
+We can see that there is a clear trade-off that comes from the complexity of our model. As model complexity increases, the model’s error on the training data decreases. At the same time, the model’s variance tends to increase.
+The takeaway here: we need to strike a balance in the complexity of our models; we want models that are generalizable to “unseen” data. A model that is too simple won’t be able to capture the key relationships between our variables of interest; a model that is too complex runs the risk of overfitting.
+This begs the question: how do we control the complexity of a model? Stay tuned for Lecture 17 on Cross-Validation and Regularization!
+PyTorch
While this material is out of scope for Data 100, it is useful if you plan to enter a career in data science!
+In practice, you will use software packages such as PyTorch
when computing gradients and implementing gradient descent. You’ll often follow three main steps:
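The original list of steps isn't reproduced here, but as a rough, hedged sketch of the usual PyTorch pattern (compute a loss, call .backward() to get gradients, then take an optimizer step), with made-up data and learning rate:

```python
import torch

# Toy data standing in for total_bill (x) and tip (y); values are made up
x = torch.tensor([10.0, 20.0, 30.0])
y = torch.tensor([1.5, 3.0, 4.5])

theta = torch.zeros(1, requires_grad=True)   # parameter to learn
optimizer = torch.optim.SGD([theta], lr=0.001)

for _ in range(100):
    optimizer.zero_grad()                    # clear old gradients
    loss = torch.mean((x * theta - y) ** 2)  # MSE for the model y_hat = theta * x
    loss.backward()                          # compute d(loss)/d(theta)
    optimizer.step()                         # theta <- theta - lr * gradient
```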
If you want to learn more, this Intro to PyTorch tutorial is a great resource to get started!
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
pd.options.mode.chained_assignment = None  # default='warn'
sklearn
Throughout this lecture, we’ll refer to the penguins
dataset.
import pandas as pd
import seaborn as sns
import numpy as np

penguins = sns.load_dataset("penguins")
penguins = penguins[penguins["species"] == "Adelie"].dropna()
penguins.head()

|   | species | island    | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex    |
|---|---------|-----------|----------------|---------------|-------------------|-------------|--------|
| 0 | Adelie  | Torgersen | 39.1           | 18.7          | 181.0             | 3750.0      | Male   |
| 1 | Adelie  | Torgersen | 39.5           | 17.4          | 186.0             | 3800.0      | Female |
| 2 | Adelie  | Torgersen | 40.3           | 18.0          | 195.0             | 3250.0      | Female |
| 4 | Adelie  | Torgersen | 36.7           | 19.3          | 193.0             | 3450.0      | Female |
| 5 | Adelie  | Torgersen | 39.3           | 20.6          | 190.0             | 3650.0      | Male   |
Our goal will be to predict the value of the "bill_depth_mm"
for a particular penguin given its "flipper_length_mm"
and "body_mass_g"
. We’ll also add a bias column of all ones to represent the intercept term of our models.
# Add a bias column of all ones to `penguins`
penguins["bias"] = np.ones(len(penguins), dtype=int)

# Define the design matrix, X...
# Note that we use .to_numpy() to convert our DataFrame into a NumPy array so it is in matrix form
X = penguins[["bias", "flipper_length_mm", "body_mass_g"]].to_numpy()

# ...as well as the target variable, Y
# Again, we use .to_numpy() to convert our DataFrame into a NumPy array so it is in matrix form
Y = penguins[["bill_depth_mm"]].to_numpy()
In the lecture on ordinary least squares, we expressed multiple linear regression using matrix notation.
+\[\hat{\mathbb{Y}} = \mathbb{X}\theta\]
+We used a geometric approach to derive the following expression for the optimal model parameters:
+\[\hat{\theta} = (\mathbb{X}^T \mathbb{X})^{-1}\mathbb{X}^T \mathbb{Y}\]
+That’s a whole lot of matrix manipulation. How do we implement it in python
?
There are three operations we need to perform here: multiplying matrices, taking transposes, and finding inverses.

- To multiply two matrices, we use the @ operator
- To take a transpose, we call the .T attribute of a NumPy array or DataFrame
- To find an inverse, we use NumPy's in-built method np.linalg.inv
Putting this all together, we can compute the OLS estimate for the optimal model parameters, stored in the array theta_hat
.
theta_hat = np.linalg.inv(X.T @ X) @ X.T @ Y
theta_hat

array([[1.10029953e+01],
       [9.82848689e-03],
       [1.47749591e-03]])
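As an aside (a sketch, not part of the original derivation), a more numerically stable way to obtain the same least-squares estimate is to avoid forming the explicit inverse:

```python
# np.linalg.lstsq solves the least-squares problem directly
# rather than computing (X^T X)^(-1) explicitly
theta_hat_lstsq, *_ = np.linalg.lstsq(X, Y, rcond=None)
theta_hat_lstsq
```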
+To make predictions using our optimized parameter values, we matrix-multiply the design matrix with the parameter vector:
+\[\hat{\mathbb{Y}} = \mathbb{X}\theta\]
Y_hat = X @ theta_hat
pd.DataFrame(Y_hat).head()

|   | 0         |
|---|-----------|
| 0 | 18.322561 |
| 1 | 18.445578 |
| 2 | 17.721412 |
| 3 | 17.997254 |
| 4 | 18.263268 |
sklearn
Workflow
We’ve already saved a lot of time (and avoided tedious calculations) by translating our derived formulas into code. However, we still had to go through the process of writing out the linear algebra ourselves.
+To make life even easier, we can turn to the sklearn
python
library. sklearn
is a robust library of machine learning tools used extensively in research and industry. It is the standard for simple machine learning tasks and gives us a wide variety of in-built modeling frameworks and methods, so we’ll keep returning to sklearn
techniques as we progress through Data 100.
Regardless of the specific type of model being implemented, sklearn
follows a standard set of steps for creating a model:
Import the LinearRegression
model from sklearn
from sklearn.linear_model import LinearRegression
Create a model object. This generates a new instance of the model class. You can think of it as making a new “copy” of a standard “template” for a model. In code, this looks like:
+my_model = LinearRegression()
Fit the model to the X
design matrix and Y
target vector. This calculates the optimal model parameters “behind the scenes” without us explicitly working through the calculations ourselves. The fitted parameters are then stored within the model for use in future predictions:
my_model.fit(X, Y)
Use the fitted model to make predictions on the X
input data using .predict
.
my_model.predict(X)
To extract the fitted parameters, we can use:
+my_model.coef_
+
+my_model.intercept_
+Let’s put this into action with our multiple regression task!
+1. Initialize an instance of the model class
+sklearn
stores “templates” of useful models for machine learning. We begin the modeling process by making a “copy” of one of these templates for our own use. Model initialization looks like ModelClass()
, where ModelClass
is the type of model we wish to create.
For now, let’s create a linear regression model using LinearRegression
.
my_model
is now an instance of the LinearRegression
class. You can think of it as the “idea” of a linear regression model. We haven’t trained it yet, so it doesn’t know any model parameters and cannot be used to make predictions. In fact, we haven’t even told it what data to use for modeling! It simply waits for further instructions.
my_model = LinearRegression()
2. Train the model using .fit
Before the model can make predictions, we will need to fit it to our training data. When we fit the model, sklearn
will determine the optimal model parameters behind the scenes. It will then save these model parameters to our model instance for future use.
All sklearn
model classes include a .fit
method, which is used to fit the model. It takes in two inputs: the design matrix, X
, and the target variable, Y
.
Let’s start by fitting a model with just one feature: the flipper length. We create a design matrix X
by pulling out the "flipper_length_mm"
column from the DataFrame
.
# .fit expects a 2D data design matrix, so we use double brackets to extract a DataFrame
X = penguins[["flipper_length_mm"]]
Y = penguins["bill_depth_mm"]

my_model.fit(X, Y)
LinearRegression()
Notice that we use double brackets to extract this column. Why double brackets instead of just single brackets? The .fit
method, by default, expects to receive 2-dimensional data – some kind of data that includes both rows and columns. Writing penguins["flipper_length_mm"]
would return a 1D Series
, causing sklearn
to error. We avoid this by writing penguins[["flipper_length_mm"]]
to produce a 2D DataFrame
.
And in just three lines of code, our model has determined the optimal model parameters! Our single-feature model takes the form:
+\[\text{bill depth} = \theta_0 + \theta_1 \text{flipper length}\]
+Note that LinearRegression
will automatically include an intercept term.
The fitted model parameters are stored as attributes of the model instance. my_model.intercept_
will return the value of \(\hat{\theta}_0\) as a scalar. my_model.coef_
will return all values \(\hat{\theta}_1, \hat{\theta}_2, \ldots\) in an array. Because our model only contains one feature, we see just the value of \(\hat{\theta}_1\) in the cell below.
# The intercept term, theta_0
my_model.intercept_

np.float64(7.297305899612313)

# All parameters theta_1, ..., theta_p
my_model.coef_

array([0.05812622])
+3. Use the fitted model to make predictions
+Now that the model has been trained, we can use it to make predictions! To do so, we use the .predict
method. .predict
takes in one argument: the design matrix that should be used to generate predictions. To understand how the model performs on the training set, we would pass in the training data. Alternatively, to make predictions on unseen data, we would pass in a new dataset that wasn’t used to train the model.
Below, we call .predict
to generate model predictions on the original training data. As before, we use double brackets to ensure that we extract 2-dimensional data.
Y_hat_one_feature = my_model.predict(penguins[["flipper_length_mm"]])

print(f"The RMSE of the model is {np.sqrt(np.mean((Y-Y_hat_one_feature)**2))}")
The RMSE of the model is 1.154936309923901
+What if we wanted a model with two features?
+\[\text{bill depth} = \theta_0 + \theta_1 \text{flipper length} + \theta_2 \text{body mass}\]
We repeat this three-step process by initializing a new model object, then calling .fit
and .predict
as before.
# Step 1: initialize LinearRegression model
two_feature_model = LinearRegression()

# Step 2: fit the model
X_two_features = penguins[["flipper_length_mm", "body_mass_g"]]
Y = penguins["bill_depth_mm"]

two_feature_model.fit(X_two_features, Y)

# Step 3: make predictions
Y_hat_two_features = two_feature_model.predict(X_two_features)

print(f"The RMSE of the model is {np.sqrt(np.mean((Y-Y_hat_two_features)**2))}")
The RMSE of the model is 0.9881331104079043
+We can also see that we obtain the same predictions using sklearn
as we did when applying the ordinary least squares formula before!
"Y_hat from OLS":np.squeeze(Y_hat), "Y_hat from sklearn":Y_hat_two_features}).head() pd.DataFrame({
+ | Y_hat from OLS | +Y_hat from sklearn | +
---|---|---|
0 | +18.322561 | +18.322561 | +
1 | +18.445578 | +18.445578 | +
2 | +17.721412 | +17.721412 | +
3 | +17.997254 | +17.997254 | +
4 | +18.263268 | +18.263268 | +
At this point, we’ve grown quite familiar with the process of choosing a model and a corresponding loss function and optimizing parameters by choosing the values of \(\theta\) that minimize the loss function. So far, we’ve optimized \(\theta\) in two ways: using calculus (take the derivative of the loss with respect to \(\theta\), set it equal to zero, and solve) and using the geometric argument of orthogonality (which yields the OLS solution).
+One thing to note, however, is that the techniques we used above can only be applied if we make some big assumptions. For the calculus approach, we assumed that the loss function was differentiable at all points and that we could algebraically solve for the zero points of the derivative; for the geometric approach, OLS only applies when using a linear model with MSE loss. What happens when we have more complex models with different, more complex loss functions? The techniques we’ve learned so far will not work, so we need a new optimization technique: gradient descent.
+++BIG IDEA: use an iterative algorithm to numerically compute the minimum of the loss.
+
Let’s consider an arbitrary function. Our goal is to find the value of \(x\) that minimizes this function.
def arbitrary(x):
    return (x**4 - 15*x**3 + 80*x**2 - 180*x + 144)/10
Above, we saw that the minimum is somewhere around 5.3. Let’s see if we can figure out how to find the exact minimum algorithmically from scratch. One very slow (and terrible) way would be manual guess-and-check.
arbitrary(6)

0.0
+A somewhat better (but still slow) approach is to use brute force to try out a bunch of x values and return the one that yields the lowest loss.
def simple_minimize(f, xs):
    # Takes in a function f and a set of values xs.
    # Calculates the value of the function f at all values x in xs.
    # Takes the minimum value of f(x) and returns the corresponding value x.
    y = [f(x) for x in xs]
    return xs[np.argmin(y)]

guesses = [5.3, 5.31, 5.32, 5.33, 5.34, 5.35]
simple_minimize(arbitrary, guesses)
5.33
+This process is essentially the same as before where we made a graphical plot, it’s just that we’re only looking at 20 selected points.
xs = np.linspace(1, 7, 200)
sparse_xs = np.linspace(1, 7, 5)

ys = arbitrary(xs)
sparse_ys = arbitrary(sparse_xs)

fig = px.line(x=xs, y=arbitrary(xs))
fig.add_scatter(x=sparse_xs, y=arbitrary(sparse_xs), mode="markers")
fig.update_layout(showlegend=False)
fig.update_layout(autosize=False, width=800, height=600)
fig.show()
This basic approach suffers from three major flaws:
+Scipy.optimize.minimize
One way to minimize this mathematical function is to use the scipy.optimize.minimize
function. It takes a function and a starting guess and tries to find the minimum.
from scipy.optimize import minimize

# takes a function f and a starting point x0 and returns a readout
# with the optimal input value of x which minimizes f
minimize(arbitrary, x0=3.5)

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: -0.13827491292966557
        x: [ 2.393e+00]
      nit: 3
      jac: [ 6.486e-06]
 hess_inv: [[ 7.385e-01]]
     nfev: 20
     njev: 10
+scipy.optimize.minimize
is great. It may also seem a bit magical. How could you write a function that can find the minimum of any mathematical function? There are a number of ways to do this, which we’ll explore in today’s lecture, eventually arriving at the important idea of gradient descent, which is the principle that scipy.optimize.minimize
uses.
It turns out that, under the hood, fitting a model comes down to numerical optimization: sklearn’s LinearRegression uses a least-squares solver, while many other models are fit with gradient-based methods. Gradient descent in particular is how much of machine learning works, including even advanced neural network models.
In Data 100, the gradient descent process will usually be invisible to us, hidden beneath an abstraction layer. However, to be good data scientists, it’s important that we know the underlying principles that optimization functions harness to find optimal parameters.
+Looking at the function across this domain, it is clear that the function’s minimum value occurs around \(\theta = 5.3\). Let’s pretend for a moment that we couldn’t see the full view of the cost function. How would we guess the value of \(\theta\) that minimizes the function?
+It turns out that the first derivative of the function can give us a clue. In the plots below, the line indicates the value of the derivative of each value of \(\theta\). The derivative is negative where it is red and positive where it is green.
+Say we make a guess for the minimizing value of \(\theta\). Remember that we read plots from left to right, and assume that our starting \(\theta\) value is to the left of the optimal \(\hat{\theta}\). If the guess “undershoots” the true minimizing value – our guess for \(\theta\) is lower than the value of the \(\hat{\theta}\) that minimizes the function – the derivative will be negative. This means that if we increase \(\theta\) (move further to the right), then we can decrease our loss function further. If this guess “overshoots” the true minimizing value, the derivative will be positive, implying the converse.
+
+
We can use this pattern to help formulate our next guess for the optimal \(\hat{\theta}\). Consider the case where we’ve undershot \(\theta\) by guessing too low of a value. We’ll want our next guess to be greater in value than our previous guess – that is, we want to shift our guess to the right. You can think of this as following the slope “downhill” to the function’s minimum value.
+
+
If we’ve overshot \(\hat{\theta}\) by guessing too high of a value, we’ll want our next guess to be lower in value – we want to shift our guess for \(\hat{\theta}\) to the left.
+
+
In other words, the derivative of the function at each point tells us the direction of our next guess.
+Armed with this knowledge, let’s try to see if we can use the derivative to optimize the function.
+We start by making some guess for the minimizing value of \(x\). Then, we look at the derivative of the function at this value of \(x\), and step downhill in the opposite direction. We can express our new rule as a recurrence relation:
+\[x^{(t+1)} = x^{(t)} - \frac{d}{dx} f(x^{(t)})\]
+Translating this statement into English: we obtain our next guess for the minimizing value of \(x\) at timestep \(t+1\) (\(x^{(t+1)}\)) by taking our last guess (\(x^{(t)}\)) and subtracting the derivative of the function at that point (\(\frac{d}{dx} f(x^{(t)})\)).
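To make the rule concrete, here is a minimal sketch of applying it for a few steps to the arbitrary function from earlier; it assumes a helper derivative_arbitrary(x) that returns \(\frac{d}{dx}f(x)\) (one such helper is defined further below in these notes), and the starting guess is arbitrary:

```python
x = 4.0                                  # an arbitrary starting guess
for t in range(5):
    x = x - derivative_arbitrary(x)      # x^(t+1) = x^(t) - f'(x^(t))
    print(f"step {t+1}: x = {x:.3f}, f(x) = {arbitrary(x):.3f}")
```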
+A few steps are shown below, where the old step is shown as a transparent point, and the next step taken is the green-filled dot.
+
+
Looking pretty good! We do have a problem though – once we arrive close to the minimum value of the function, our guesses “bounce” back and forth past the minimum without ever reaching it.
+
+
In other words, each step we take when updating our guess moves us too far. We can address this by decreasing the size of each step.
+Let’s update our algorithm to use a learning rate (also sometimes called the step size), which controls how far we move with each update. We represent the learning rate with \(\alpha\).
+\[x^{(t+1)} = x^{(t)} - \alpha \frac{d}{dx} f(x^{(t)})\]
+A small \(\alpha\) means that we will take small steps; a large \(\alpha\) means we will take large steps. When do we stop updating? We stop updating either after a fixed number of updates or after a subsequent update doesn’t change much.
+Updating our function to use \(\alpha=0.3\), our algorithm successfully converges (settles on a solution and stops updating significantly, or at all) on the minimum value.
+
+
In our analysis above, we focused our attention on the global minimum of the loss function. You may be wondering: what about the local minimum that’s just to the left?
+If we had chosen a different starting guess for \(\theta\), or a different value for the learning rate \(\alpha\), our algorithm may have gotten “stuck” and converged on the local minimum, rather than on the true optimum value of loss.
+
+
If the loss function is convex, gradient descent is guaranteed to converge and find the global minimum of the objective function. Formally, a function \(f\) is convex if: \[tf(a) + (1-t)f(b) \geq f(ta + (1-t)b)\] for all \(a, b\) in the domain of \(f\) and \(t \in [0, 1]\).
+To put this into words: if you drew a line between any two points on the curve, all values on the curve must be on or below the line. Importantly, any local minimum of a convex function is also its global minimum so we avoid the situation where the algorithm converges on some critical point that is not the minimum of the function.
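As a quick worked check of this definition, take \(f(x) = x^2\): for any \(a, b\) and \(t \in [0, 1]\),
\[t a^2 + (1-t) b^2 - \big(ta + (1-t)b\big)^2 = t(1-t)(a-b)^2 \geq 0,\]
so every chord lies on or above the parabola, and \(f\) is convex.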
+
+
In summary, non-convex loss functions can cause problems with optimization. This means that our choice of loss function is a key factor in our modeling process. It turns out that MSE is convex, which is a major reason why it is such a popular choice of loss function. Gradient descent is only guaranteed to converge (given enough iterations and an appropriate step size) for convex functions.
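For instance, for the one-parameter model \(\hat{y} = \theta_1 x\) used later in this lecture, MSE is convex in \(\theta_1\): its second derivative is
\[\frac{d^2}{d\theta_1^2}\left[\frac{1}{n}\sum_{i=1}^{n}(y_i - \theta_1 x_i)^2\right] = \frac{2}{n}\sum_{i=1}^{n} x_i^2 \geq 0,\]
so the loss curve is an upward-opening parabola with a single global minimum.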
+++Terminology clarification: In past lectures, we have used “loss” to refer to the error incurred on a single datapoint. In applications, we usually care more about the average error across all datapoints. Going forward, we will take the “model’s loss” to mean the model’s average error across the dataset. This is sometimes also known as the empirical risk, cost function, or objective function. \[L(\theta) = R(\theta) = \frac{1}{n} \sum_{i=1}^{n} L(y, \hat{y})\]
+
In our discussion above, we worked with some arbitrary function \(f\). As data scientists, we will almost always work with gradient descent in the context of optimizing models – specifically, we want to apply gradient descent to find the minimum of a loss function. In a modeling context, our goal is to minimize a loss function by choosing the minimizing model parameters.
Recall our modeling workflow from the past few lectures: we define a model with some parameters \(\theta_i\), choose a loss function, and select the values of \(\theta_i\) that minimize the loss on our data.
+Gradient descent is a powerful technique for completing this last task. By applying the gradient descent algorithm, we can select values for our parameters \(\theta_i\) that will lead to the model having minimal loss on the training data.
+When using gradient descent in a modeling context, we:
+We can “translate” our gradient descent rule from before by replacing \(x\) with \(\theta\) and \(f\) with \(L\):
+\[\theta^{(t+1)} = \theta^{(t)} - \alpha \frac{d}{d\theta} L(\theta^{(t)})\]
+tips
Dataset
To see this in action, let’s consider a case where we have a linear model with no offset. We want to predict the tip (\(y\)) given the price of a meal (\(x\)). To do this, we choose the model \(\hat{y} = \theta_1 x\) and fit it by minimizing the mean squared error.
+Let’s apply our gradient_descent
function from before to optimize our model on the tips
dataset. We will try to select the best parameter \(\theta_i\) to predict the tip
\(y\) from the total_bill
\(x\).
df = sns.load_dataset("tips")
df.head()

|   | total_bill | tip  | sex    | smoker | day | time   | size |
|---|------------|------|--------|--------|-----|--------|------|
| 0 | 16.99      | 1.01 | Female | No     | Sun | Dinner | 2    |
| 1 | 10.34      | 1.66 | Male   | No     | Sun | Dinner | 3    |
| 2 | 21.01      | 3.50 | Male   | No     | Sun | Dinner | 3    |
| 3 | 23.68      | 3.31 | Male   | No     | Sun | Dinner | 2    |
| 4 | 24.59      | 3.61 | Female | No     | Sun | Dinner | 4    |
We can visualize the value of the MSE on our dataset for different possible choices of \(\theta_1\). To optimize our model, we want to select the value of \(\theta_1\) that leads to the lowest MSE.
import plotly.graph_objects as go

def derivative_arbitrary(x):
    return (4*x**3 - 45*x**2 + 160*x - 180)/10

fig = go.Figure()
roots = np.array([2.3927, 3.5309, 5.3263])

fig.add_trace(go.Scatter(x=xs, y=arbitrary(xs),
                         mode="lines", name="f"))
fig.add_trace(go.Scatter(x=xs, y=derivative_arbitrary(xs),
                         mode="lines", name="df", line={"dash": "dash"}))
fig.add_trace(go.Scatter(x=np.array(roots), y=0*roots,
                         mode="markers", name="df = zero", marker_size=12))
fig.update_layout(font_size=20, yaxis_range=[-1, 3])
fig.update_layout(autosize=False, width=800, height=600)
fig.show()
To apply gradient descent, we need to compute the derivative of the loss function with respect to our parameter \(\theta_1\). For the model \(\hat{y} = \theta_1 x\) with MSE loss,
\[L(\theta_1) = \frac{1}{n}\sum_{i=1}^{n}(y_i - \theta_1 x_i)^2, \qquad \frac{d}{d\theta_1} L(\theta_1) = \frac{2}{n}\sum_{i=1}^{n}(\theta_1 x_i - y_i)\,x_i,\]
so each update takes the form
\[\theta_1^{(t+1)} = \theta_1^{(t)} - \alpha \frac{d}{d\theta_1} L(\theta_1^{(t)})\]
for some learning rate \(\alpha\).
+Implementing this in code, we can visualize the MSE loss on the tips
data. MSE is convex, so there is one global minimum.
def gradient_descent(df, initial_guess, alpha, n):
    """Performs n steps of gradient descent on df using learning rate alpha starting
    from initial_guess. Returns a numpy array of all guesses over time."""
    guesses = [initial_guess]
    current_guess = initial_guess
    while len(guesses) < n:
        current_guess = current_guess - alpha * df(current_guess)
        guesses.append(current_guess)

    return np.array(guesses)

def mse_single_arg(theta_1):
    """Returns the MSE on our data for the given theta1"""
    x = df["total_bill"]
    y_obs = df["tip"]
    y_hat = theta_1 * x
    return np.mean((y_hat - y_obs) ** 2)

def mse_loss_derivative_single_arg(theta_1):
    """Returns the derivative of the MSE on our data for the given theta1"""
    x = df["total_bill"]
    y_obs = df["tip"]
    y_hat = theta_1 * x

    return np.mean(2 * (y_hat - y_obs) * x)

loss_df = pd.DataFrame({"theta_1": np.linspace(-1.5, 1),
                        "MSE": [mse_single_arg(theta_1) for theta_1 in np.linspace(-1.5, 1)]})

trajectory = gradient_descent(mse_loss_derivative_single_arg, -0.5, 0.0001, 100)

plt.plot(loss_df["theta_1"], loss_df["MSE"])
plt.scatter(trajectory, [mse_single_arg(guess) for guess in trajectory], c="white", edgecolor="firebrick")
plt.scatter(trajectory[-1], mse_single_arg(trajectory[-1]), c="firebrick")
plt.xlabel(r"$\theta_1$")
plt.ylabel(r"$L(\theta_1)$");

print(f"Final guess for theta_1: {trajectory[-1]}")
Final guess for theta_1: 0.14369554654231262
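As a quick sanity check (not part of the original notes), this one-parameter problem also has a closed-form minimizer, \(\theta_1^* = \frac{\sum_i x_i y_i}{\sum_i x_i^2}\), which the gradient descent estimate above should approach:

```python
# Closed-form minimizer of MSE for the no-intercept model y_hat = theta_1 * x
x, y_obs = df["total_bill"], df["tip"]
theta_1_closed_form = np.sum(x * y_obs) / np.sum(x**2)
theta_1_closed_form
```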
The function we worked with above was one-dimensional – we were only minimizing the function with respect to a single parameter, \(\theta\). However, models usually have a cost function with multiple parameters that need to be optimized. For example, simple linear regression has 2 parameters: \[\hat{y} = \theta_0 + \theta_1 x\] and multiple linear regression has \(p+1\) parameters: \[\hat{\mathbb{Y}} = \theta_0 + \theta_1 \mathbb{X}_{:,1} + \theta_2 \mathbb{X}_{:,2} + \cdots + \theta_p \mathbb{X}_{:,p}\]
+We’ll need to expand gradient descent so we can update our guesses for all model parameters all in one go.
+With multiple parameters to optimize, we consider a loss surface, or the model’s loss for a particular combination of possible parameter values.
import plotly.graph_objects as go


def mse_loss(theta, X, y_obs):
    y_hat = X @ theta
    return np.mean((y_hat - y_obs) ** 2)

tips_with_bias = df.copy()
tips_with_bias["bias"] = 1
tips_with_bias = tips_with_bias[["bias", "total_bill"]]

uvalues = np.linspace(0, 2, 10)
vvalues = np.linspace(-0.1, 0.35, 10)
(u, v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(), v.flatten()))

def mse_loss_single_arg(theta):
    return mse_loss(theta, tips_with_bias, df["tip"])

MSE = np.array([mse_loss_single_arg(t) for t in thetas.T])

loss_surface = go.Surface(x=u, y=v, z=np.reshape(MSE, u.shape))

ind = np.argmin(MSE)
optimal_point = go.Scatter3d(name="Optimal Point",
                             x=[thetas.T[ind, 0]], y=[thetas.T[ind, 1]],
                             z=[MSE[ind]],
                             marker=dict(size=10, color="red"))

fig = go.Figure(data=[loss_surface, optimal_point])
fig.update_layout(scene=dict(
    xaxis_title="theta0",
    yaxis_title="theta1",
    zaxis_title="MSE"), autosize=False, width=800, height=600)

fig.show()
We can also visualize a bird’s-eye view of the loss surface from above using a contour plot:
contour = go.Contour(x=u[0], y=v[:, 0], z=np.reshape(MSE, u.shape))
fig = go.Figure(contour)
fig.update_layout(
    xaxis_title="theta0",
    yaxis_title="theta1", autosize=False, width=800, height=600)

fig.show()
As before, the derivative of the loss function tells us the best way towards the minimum value.
On a 2D (or higher-dimensional) surface, the direction of steepest descent is described by a vector: the negative of the gradient.
+
+
++Math Aside: Partial Derivatives
+
+++
+- For an equation with multiple variables, we take a partial derivative by differentiating with respect to just one variable at a time. The partial derivative is denoted with a \(\partial\). Intuitively, we want to see how the function changes if we only vary one variable while holding other variables constant.
+- Using \(f(x, y) = 3x^2 + y\) as an example, +
++
- taking the partial derivative with respect to x and treating y as a constant gives us \(\frac{\partial f}{\partial x} = 6x\)
+- taking the partial derivative with respect to y and treating x as a constant gives us \(\frac{\partial f}{\partial y} = 1\)
+
For the vector of parameter values \(\vec{\theta} = \begin{bmatrix} + \theta_{0} \\ + \theta_{1} \\ + \end{bmatrix}\), we take the partial derivative of loss with respect to each parameter: \(\frac{\partial L}{\partial \theta_0}\) and \(\frac{\partial L}{\partial \theta_1}\).
+++For example, consider the 2D function: \[f(\theta_0, \theta_1) = 8 \theta_0^2 + 3\theta_0\theta_1\] For a function of 2 variables \(f(\theta_0, \theta_1)\), we define the gradient \[ +\begin{align} +\frac{\partial f}{\partial \theta_{0}} &= 16\theta_0 + 3\theta_1 \\ +\frac{\partial f}{\partial \theta_{1}} &= 3\theta_0 \\ +\nabla_{\vec{\theta}} f(\vec{\theta}) &= \begin{bmatrix} 16\theta_0 + 3\theta_1 \\ 3\theta_0 \\ \end{bmatrix} +\end{align} +\]
+
The gradient vector of a generic function of \(p+1\) variables is therefore \[\nabla_{\vec{\theta}} L = \begin{bmatrix} \frac{\partial L}{\partial \theta_0} \\ \frac{\partial L}{\partial \theta_1} \\ \vdots \end{bmatrix}\] Note that \(\nabla_{\vec{\theta}} L\) points in the direction of steepest ascent of the surface, so we step in the opposite direction, \(-\nabla_{\vec{\theta}} L\), to go downhill. We can interpret each component of the gradient as: “If I nudge the \(i\)th model weight, what happens to loss?”
+We can use this to update our 1D gradient rule for models with multiple parameters.
+Recall our 1D update rule: \[\theta^{(t+1)} = \theta^{(t)} - \alpha \frac{d}{d\theta}L(\theta^{(t)})\]
For models with multiple parameters, we work in terms of vectors: \[\begin{bmatrix} + \theta_{0}^{(t+1)} \\ + \theta_{1}^{(t+1)} \\ + \vdots + \end{bmatrix} = \begin{bmatrix} + \theta_{0}^{(t)} \\ + \theta_{1}^{(t)} \\ + \vdots + \end{bmatrix} - \alpha \begin{bmatrix} + \frac{\partial L}{\partial \theta_{0}} \\ + \frac{\partial L}{\partial \theta_{1}} \\ + \vdots \\ + \end{bmatrix}\]
Written in a more compact form, \[\vec{\theta}^{(t+1)} = \vec{\theta}^{(t)} - \alpha \nabla_{\vec{\theta}} L(\theta^{(t)}) \]
+Formally, the algorithm we derived above is called batch gradient descent. For each iteration of the algorithm, the derivative of loss is computed across the entire batch of all \(n\) datapoints. While this update rule works well in theory, it is not practical in most circumstances. For large datasets (with perhaps billions of datapoints), finding the gradient across all the data is incredibly computationally taxing; gradient descent will converge slowly because each individual update is slow.
+Stochastic (mini-batch) gradient descent tries to address this issue. In stochastic descent, only a sample of the full dataset is used at each update. We estimate the true gradient of the loss surface using just that sample of data. The batch size is the number of data points used in each sample. The sampling strategy is generally without replacement (data is shuffled and batch size examples are selected one at a time.)
Each complete “pass” through the data is known as a training epoch. After shuffling the data, in a single training epoch of stochastic gradient descent, we repeatedly take the next batch-size-many points, estimate the gradient of the loss on just that mini-batch, and update \(\vec{\theta}\), continuing until every point has been used.
+Every data point appears once in a single training epoch. We then perform several training epochs until we’re satisfied.
Batch gradient descent is a deterministic technique – because the entire dataset is used at each update iteration, the algorithm will always advance towards the minimum of the loss surface. In contrast, stochastic gradient descent involves an element of randomness. Since only a subset of the full data is used to update the guess for \(\vec{\theta}\) at each iteration, there’s a chance the algorithm will not progress towards the true minimum of loss with each update. Over the longer term, these stochastic techniques should still converge towards the optimal solution.
+The diagrams below represent a “bird’s eye view” of a loss surface from above. Notice that batch gradient descent takes a direct path towards the optimal \(\hat{\theta}\). Stochastic gradient descent, in contrast, “hops around” on its path to the minimum point on the loss surface. This reflects the randomness of the sampling process at each update step.
+
+
To summarize the tradeoffs of batch size:

|      | Smaller Batch Size | Larger Batch Size |
|------|--------------------|-------------------|
| Pros | More frequent gradient updates | Leverages hardware acceleration to improve overall system performance; higher-quality gradient updates |
| Cons | More variability in the gradient estimates | Less frequent gradient updates |
The typical solution is to set batch size to ensure sufficient hardware utilization.
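To tie the batch and mini-batch ideas together, here is a minimal sketch (an illustration, not the notes' own code) applied to the two-parameter tips model from the loss-surface example above; the learning rate, iteration count, and batch size are assumed values:

```python
import numpy as np

def mse_gradient(theta, X, y_obs):
    """Gradient of MSE with respect to theta for the linear model X @ theta."""
    n = len(X)
    return (2 / n) * (X.T @ (X @ theta - y_obs))

# Reuse the two-column design matrix (bias, total_bill) built earlier
X_gd = tips_with_bias.to_numpy()
y_gd = df["tip"].to_numpy()
alpha = 0.0001              # assumed learning rate (small because total_bill is unscaled)

# Batch gradient descent: every update uses all n points
theta = np.zeros(2)         # initial guess for [theta_0, theta_1]
for _ in range(10_000):
    theta = theta - alpha * mse_gradient(theta, X_gd, y_gd)

# One epoch of mini-batch stochastic gradient descent: shuffle once,
# then update on successive batches drawn without replacement
batch_size = 32             # assumed batch size
indices = np.random.permutation(len(X_gd))
for start in range(0, len(X_gd), batch_size):
    batch = indices[start:start + batch_size]
    theta = theta - alpha * mse_gradient(theta, X_gd[batch], y_gd[batch])

theta
```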
+ + + + +Last time, we introduced the idea of random variables and how they affect the data and model we construct. We also demonstrated the decomposition of model risk from a fitted model and dived into the bias-variance tradeoff.
In this lecture, we will explore regression inference via hypothesis testing, understand how to use bootstrapping under the right assumptions, and consider the challenges of understanding causality in theory and in practice.
+There are two main reasons why we build models:
+Recall the framework we established in the last lecture. The relationship between datapoints is given by \(Y = g(x) + \epsilon\), where \(g(x)\) is the true underlying relationship, and \(\epsilon\) represents randomness. If we assume \(g(x)\) is linear, we can express this relationship in terms of the unknown, true model parameters \(\theta\).
+\[f_{\theta}(x) = g(x) + \epsilon = \theta_0 + \theta_1 x_1 + \ldots + \theta_p x_p + \epsilon\]
+Our model attempts to estimate each true population parameter \(\theta_i\) using the sample estimates \(\hat{\theta}_i\) calculated from the design matrix \(\Bbb{X}\) and response vector \(\Bbb{Y}\).
+\[f_{\hat{\theta}}(x) = \hat{\theta}_0 + \hat{\theta}_1 x_1 + \ldots + \hat{\theta}_p x_p\]
+Let’s pause for a moment. At this point, we’re very used to working with the idea of a model parameter. But what exactly does each coefficient \(\theta_i\) actually mean? We can think of each \(\theta_i\) as a slope of the linear model. If all other variables are held constant, a unit change in \(x_i\) will result in a \(\theta_i\) change in \(f_{\theta}(x)\). Broadly speaking, a large value of \(\theta_i\) means that the feature \(x_i\) has a large effect on the response; conversely, a small value of \(\theta_i\) means that \(x_i\) has little effect on the response. In the extreme case, if the true parameter \(\theta_i\) is 0, then the feature \(x_i\) has no effect on \(Y(x)\).
+If the true parameter \(\theta_i\) for a particular feature is 0, this tells us something pretty significant about the world: there is no underlying relationship between \(x_i\) and \(Y(x)\)! But how can we test if a parameter is actually 0? As a baseline, we go through our usual process of drawing a sample, using this data to fit a model, and computing an estimate \(\hat{\theta}_i\). However, we also need to consider that if our random sample comes out differently, we may find a different result for \(\hat{\theta}_i\). To infer if the true parameter \(\theta_i\) is 0, we want to draw our conclusion from the distribution of \(\hat{\theta}_i\) estimates we could have drawn across all other random samples. This is where hypothesis testing comes in handy!
+To test if the true parameter \(\theta_i\) is 0, we construct a hypothesis test where our null hypothesis states that the true parameter \(\theta_i\) is 0, and the alternative hypothesis states that the true parameter \(\theta_i\) is not 0. If our p-value is smaller than our cutoff value (usually p = 0.05), we reject the null hypothesis in favor of the alternative hypothesis.
+To determine the properties (e.g., variance) of the sampling distribution of an estimator, we’d need access to the population. Ideally, we’d want to consider all possible samples in the population, compute an estimate for each sample, and study the distribution of those estimates.
+
+
+
However, this can be quite expensive and time-consuming. Even more importantly, we don’t have access to the population; we only have one random sample from the population. How can we consider all possible samples if we only have one?
Bootstrapping comes in handy here! With bootstrapping, we treat our random sample as a “population” and resample from it with replacement. Intuitively, a random sample resembles the population (if it is big enough), so a random resample also resembles a random sample of the population. When resampling, there are a couple of things to keep in mind: each resample must be drawn with replacement (otherwise we would simply reproduce the original sample), and it must be the same size as the original sample.
+
+
+
Bootstrap resampling is a technique for estimating the sampling distribution of an estimator. To execute it, we can follow the pseudocode below:
+collect a random sample of size n (called the bootstrap population)
+
+initiate a list of estimates
+
+repeat 10,000 times:
+ resample with replacement from the bootstrap population
+ apply estimator f to the resample
+ store in list
+
+list of estimates is the bootstrapped sampling distribution of f
+How well does bootstrapping actually represent our population? The bootstrapped sampling distribution of an estimator does not exactly match the sampling distribution of that estimator, but it is often close. Similarly, the variance of the bootstrapped distribution is often close to the true variance of the estimator. The example below displays the results of different bootstraps from a known population using a sample size of \(n=50\).
+
+
+
In the real world, we don’t know the population distribution. The center of the bootstrapped distribution is the estimator applied to our original sample, so we have no way of understanding the estimator’s true expected value; the center and spread of our bootstrap are approximations. The quality of our bootstrapped distribution also depends on the quality of our original sample. If our original sample was not representative of the population (like Sample 5 in the image above), then the bootstrap is next to useless. In general, bootstrapping works better for large samples, when the population distribution is not heavily skewed (no outliers), and when the estimator is “low variance” (insensitive to extreme values).
Although our bootstrapped sample distribution does not exactly match the sampling distribution of the population, we can see that it is relatively close. This demonstrates the benefit of bootstrapping: without knowing the actual population distribution, we can still roughly approximate the true slope for the model by using only a single random sample of 20 cars.
+ +We can conduct the hypothesis testing described earlier through bootstrapping (this equivalence can be proven through the duality argument, which is out of scope for this class). We use bootstrapping to compute approximate 95% confidence intervals for each \(\theta_i\). If the interval doesn’t contain 0, we reject the null hypothesis at the p=5% level. Otherwise, the data is consistent with the null, as the true parameter could possibly be 0.
+To show an example of this hypothesis testing process, we’ll work with the snowy plover dataset throughout this section. The data are about the eggs and newly hatched chicks of the Snowy Plover. The data were collected at the Point Reyes National Seashore by a former student at Berkeley. Here’s a parent bird and some eggs.
+
+
+
Note that Egg Length
and Egg Breadth
(widest diameter) are measured in millimeters, and Egg Weight
and Bird Weight
are measured in grams. For reference, a standard paper clip weighs about one gram.
import pandas as pd
eggs = pd.read_csv("data/snowy_plover.csv")
eggs.head(5)

|   | egg_weight | egg_length | egg_breadth | bird_weight |
|---|------------|------------|-------------|-------------|
| 0 | 7.4        | 28.80      | 21.84       | 5.2         |
| 1 | 7.7        | 29.04      | 22.45       | 5.4         |
| 2 | 7.9        | 29.36      | 22.48       | 5.6         |
| 3 | 7.5        | 30.10      | 21.71       | 5.3         |
| 4 | 8.3        | 30.17      | 22.75       | 5.9         |
Our goal will be to predict the weight of a newborn plover chick, which we assume follows the true relationship \(Y = f_{\theta}(x)\) below.
+\[\text{bird\_weight} = \theta_0 + \theta_1 \text{egg\_weight} + \theta_2 \text{egg\_length} + \theta_3 \text{egg\_breadth} + \epsilon\]
+Note that for each \(i\), the parameter \(\theta_i\) is a fixed number, but it is unobservable. We can only estimate it. The random error \(\epsilon\) is also unobservable, but it is assumed to have expectation 0 and be independent and identically distributed across eggs.
+Say we wish to determine if the egg_weight
impacts the bird_weight
of a chick – we want to infer if \(\theta_1\) is equal to 0.
First, we define our hypotheses:
+Next, we use our data to fit a model \(\hat{Y} = f_{\hat{\theta}}(x)\) that approximates the relationship above. This gives us the observed value of \(\hat{\theta}_1\) from our data.
from sklearn.linear_model import LinearRegression
import numpy as np

X = eggs[["egg_weight", "egg_length", "egg_breadth"]]
Y = eggs["bird_weight"]

model = LinearRegression()
model.fit(X, Y)

# This gives an array containing the fitted model parameter estimates
thetas = model.coef_

# Put the parameter estimates in a nice table for viewing
display(pd.DataFrame(
    [model.intercept_] + list(model.coef_),
    columns=['theta_hat'],
    index=['intercept', 'egg_weight', 'egg_length', 'egg_breadth']
))

print("MSE", np.mean((Y - model.predict(X)) ** 2))

|             | theta_hat |
|-------------|-----------|
| intercept   | -4.605670 |
| egg_weight  | 0.431229  |
| egg_length  | 0.066570  |
| egg_breadth | 0.215914  |

MSE 0.045470853802757547
Our single sample of data gives us the value of \(\hat{\theta}_1=0.431\). To get a sense of how this estimate might vary if we were to draw different random samples, we will use bootstrapping. As a refresher, to construct a bootstrap sample, we draw a resample from the collected data that is the same size as the original sample and is drawn with replacement.
+We draw a bootstrap sample, use this sample to fit a model, and record the result for \(\hat{\theta}_1\) on this bootstrapped sample. We then repeat this process many times to generate a bootstrapped empirical distribution of \(\hat{\theta}_1\). This gives us an estimate of what the true distribution of \(\hat{\theta}_1\) across all possible samples might look like.
# Set a random seed so you generate the same random sample as staff
# In the "real world", we wouldn't do this
import numpy as np
np.random.seed(1337)

# Set the sample size of each bootstrap sample
n = len(eggs)

# Create a list to store all the bootstrapped estimates
estimates = []

# Generate a bootstrap resample from `eggs` and find an estimate for theta_1 using this sample.
# Repeat 10000 times.
for i in range(10000):
    # draw a bootstrap sample
    bootstrap_resample = eggs.sample(n, replace=True)
    X_bootstrap = bootstrap_resample[["egg_weight", "egg_length", "egg_breadth"]]
    Y_bootstrap = bootstrap_resample["bird_weight"]

    # use the bootstrapped sample to fit a model
    bootstrap_model = LinearRegression()
    bootstrap_model.fit(X_bootstrap, Y_bootstrap)
    bootstrap_thetas = bootstrap_model.coef_

    # record the result for theta_1
    estimates.append(bootstrap_thetas[0])

# calculate the 95% confidence interval
lower = np.percentile(estimates, 2.5, axis=0)
upper = np.percentile(estimates, 97.5, axis=0)
conf_interval = (lower, upper)
conf_interval
(np.float64(-0.2586481195684874), np.float64(1.103424385420405))
+Our bootstrapped 95% confidence interval for \(\theta_1\) is \([-0.259, 1.103]\). Immediately, we can see that 0 is indeed contained in this interval – this means that we cannot conclude that \(\theta_1\) is non-zero! More formally, we fail to reject the null hypothesis (that \(\theta_1\) is 0) under a 5% p-value cutoff.
+We can repeat this process to construct 95% confidence intervals for the other parameters of the model.
+1337)
+ np.random.seed(
+= []
+ theta_0_estimates = []
+ theta_1_estimates = []
+ theta_2_estimates = []
+ theta_3_estimates
+
+for i in range(10000):
+= eggs.sample(n, replace=True)
+ bootstrap_resample = bootstrap_resample[["egg_weight", "egg_length", "egg_breadth"]]
+ X_bootstrap = bootstrap_resample["bird_weight"]
+ Y_bootstrap
+ = LinearRegression()
+ bootstrap_model
+ bootstrap_model.fit(X_bootstrap, Y_bootstrap)= bootstrap_model.intercept_
+ bootstrap_theta_0 = bootstrap_model.coef_
+ bootstrap_theta_1, bootstrap_theta_2, bootstrap_theta_3
+
+ theta_0_estimates.append(bootstrap_theta_0)
+ theta_1_estimates.append(bootstrap_theta_1)
+ theta_2_estimates.append(bootstrap_theta_2)
+ theta_3_estimates.append(bootstrap_theta_3)
+ = np.percentile(theta_0_estimates, 2.5), np.percentile(theta_0_estimates, 97.5)
+ theta_0_lower, theta_0_upper = np.percentile(theta_1_estimates, 2.5), np.percentile(theta_1_estimates, 97.5)
+ theta_1_lower, theta_1_upper = np.percentile(theta_2_estimates, 2.5), np.percentile(theta_2_estimates, 97.5)
+ theta_2_lower, theta_2_upper = np.percentile(theta_3_estimates, 2.5), np.percentile(theta_3_estimates, 97.5)
+ theta_3_lower, theta_3_upper
+# Make a nice table to view results
+"lower":[theta_0_lower, theta_1_lower, theta_2_lower, theta_3_lower], "upper":[theta_0_upper, \
+ pd.DataFrame({=["theta_0", "theta_1", "theta_2", "theta_3"]) theta_1_upper, theta_2_upper, theta_3_upper]}, index
+ | lower | +upper | +
---|---|---|
theta_0 | +-15.278542 | +5.161473 | +
theta_1 | +-0.258648 | +1.103424 | +
theta_2 | +-0.099138 | +0.208557 | +
theta_3 | +-0.257141 | +0.758155 | +
Something’s off here. Notice that 0 is included in the 95% confidence interval for every parameter of the model. Using the interpretation we outlined above, this would suggest that we can’t say for certain that any of the input variables impact the response variable! This makes it seem like our model can’t make any predictions – and yet, each model we fit in our bootstrap experiment above could very much make predictions of \(Y\).
+How can we explain this result? Think back to how we first interpreted the parameters of a linear model. We treated each \(\theta_i\) as a slope, where a unit increase in \(x_i\) leads to a \(\theta_i\) increase in \(Y\), if all other variables are held constant. It turns out that this last assumption is very important. If variables in our model are somehow related to one another, then it might not be possible to have a change in one of them while holding the others constant. This means that our interpretation framework is no longer valid! In the models we fit above, we incorporated egg_length
, egg_breadth
, and egg_weight
as input variables. These variables are very likely related to one another – an egg with large egg_length
and egg_breadth
will likely be heavy in egg_weight
. This means that the model parameters cannot be meaningfully interpreted as slopes.
To support this conclusion, we can visualize the relationships between our feature variables. Notice the strong positive association between the features.
import seaborn as sns
sns.pairplot(eggs[["egg_length", "egg_breadth", "egg_weight", 'bird_weight']]);
This issue is known as collinearity, sometimes also called multicollinearity. Collinearity occurs when one feature can be predicted fairly accurately by a linear combination of the other features, which happens when one feature is highly correlated with the others.
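One simple way to check for this (a sketch, not part of the original analysis) is to look at the pairwise correlations between the features:

```python
# Pairwise correlations close to 1 signal collinearity among the features
eggs[["egg_weight", "egg_length", "egg_breadth"]].corr()
```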
+Why is collinearity a problem? Its consequences span several aspects of the modeling process:
+The take-home point is that we need to be careful with what features we select for modeling. If two features likely encode similar information, it is often a good idea to choose only one of them as an input variable.
+Let us now consider a more interpretable model: we instead assume a true relationship using only egg weight:
+\[f_\theta(x) = \theta_0 + \theta_1 \text{egg\_weight} + \epsilon\]
from sklearn.linear_model import LinearRegression

X_int = eggs[["egg_weight"]]
Y_int = eggs["bird_weight"]

model_int = LinearRegression()
model_int.fit(X_int, Y_int)

# This gives an array containing the fitted model parameter estimates
thetas_int = model_int.coef_

# Put the parameter estimates in a nice table for viewing
pd.DataFrame({"theta_hat": [model_int.intercept_, thetas_int[0]]}, index=["theta_0", "theta_1"])

|         | theta_hat |
|---------|-----------|
| theta_0 | -0.058272 |
| theta_1 | 0.718515  |
import matplotlib.pyplot as plt

# Set a random seed so you generate the same random sample as staff
# In the "real world", we wouldn't do this
np.random.seed(1337)

# Set the sample size of each bootstrap sample
n = len(eggs)

# Create a list to store all the bootstrapped estimates
estimates_int = []

# Generate a bootstrap resample from `eggs` and find an estimate for theta_1 using this sample.
# Repeat 10000 times.
for i in range(10000):
    bootstrap_resample_int = eggs.sample(n, replace=True)
    X_bootstrap_int = bootstrap_resample_int[["egg_weight"]]
    Y_bootstrap_int = bootstrap_resample_int["bird_weight"]

    bootstrap_model_int = LinearRegression()
    bootstrap_model_int.fit(X_bootstrap_int, Y_bootstrap_int)
    bootstrap_thetas_int = bootstrap_model_int.coef_

    estimates_int.append(bootstrap_thetas_int[0])

plt.figure(dpi=120)
sns.histplot(estimates_int, stat="density")
plt.xlabel(r"$\hat{\theta}_1$")
plt.title(r"Bootstrapped estimates $\hat{\theta}_1$ Under the Interpretable Model");
Notice how the interpretable model performs almost as well as our other model:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(Y, model.predict(X))
mse_int = mean_squared_error(Y_int, model_int.predict(X_int))
print(f'MSE of Original Model: {mse}')
print(f'MSE of Interpretable Model: {mse_int}')

MSE of Original Model: 0.045470853802757547
MSE of Interpretable Model: 0.04649394137555684
+Yet, the confidence interval for the true parameter \(\theta_{1}\) does not contain zero.
lower_int = np.percentile(estimates_int, 2.5)
upper_int = np.percentile(estimates_int, 97.5)

conf_interval_int = (lower_int, upper_int)
conf_interval_int
(np.float64(0.6029335250209632), np.float64(0.8208401738546208))
+In retrospect, it’s no surprise that the weight of an egg best predicts the weight of a newly-hatched chick.
+A model with highly correlated variables prevents us from interpreting how the variables are related to the prediction.
+Keep the following in mind: All inference assumes that the regression model holds.
+Note: the content in this section is out of scope.
+ +The difference between correlation/prediction vs. causation is best illustrated through examples.
+Some questions about correlation / prediction include:
+While these may sound like causal questions, they are not! Questions about causality are about the effects of interventions (not just passive observation). For example:
+Note, however, that regression coefficients are sometimes called “effects”, which can be deceptive!
+When using data alone, predictive questions (i.e., are breastfed babies healthier?) can be answered, but causal questions (i.e., does breastfeeding improve babies’ health?) cannot. The reason for this is that there are many possible causes for our predictive question. For example, possible explanations for why breastfed babies are healthier on average include:
+We cannot tell which explanations are true (or to what extent) just by observing (\(x\),\(y\)) pairs. Additionally, causal questions implicitly involve counterfactuals, events that didn’t happen. For example, we could ask, would the same breastfed babies have been less healthy if they hadn’t been breastfed? Explanation 1 from above implies they would be, but explanations 2 and 3 do not.
+Let T represent a treatment (for example, alcohol use) and Y represent an outcome (for example, lung cancer).
+A confounder is a variable that affects both T and Y, distorting the correlation between them. Using the example above, rich parents could be a confounder for breastfeeding and a baby’s health. Confounders can be a measured covariate (a feature) or an unmeasured variable we don’t know about, and they generally cause problems, as the relationship between T and Y is affected by data we cannot see. We commonly assume that all confounders are observed (this is also called ignorability).
+In a randomized experiment, participants are randomly assigned into two groups: treatment and control. A treatment is applied only to the treatment group. We assume ignorability and gather as many measurements as possible so that we can compare them between the control and treatment groups to determine whether or not the treatment has a true effect or is just a confounding factor.
+However, often, randomly assigning treatments is impractical or unethical. For example, assigning a treatment of cigarettes to test the effect of smoking on the lungs would not only be impractical but also unethical.
+An alternative to bypass this issue is to utilize observational studies. This can be done by obtaining two participant groups separated based on some identified treatment variable. Unlike randomized experiments, however, we cannot assume ignorability here: the participants could have separated into two groups based on other covariates! In addition, there could also be unmeasured confounders.
+Up until this point in the semester, we’ve focused on analyzing datasets. We’ve looked into the early stages of the data science lifecycle, focusing on the programming tools, visualization techniques, and data cleaning methods needed for data analysis.
+This lecture marks a shift in focus. We will move away from examining datasets to actually using our data to better understand the world. Specifically, the next sequence of lectures will explore predictive modeling: generating models to make some predictions about the world around us. In this lecture, we’ll introduce the conceptual framework for setting up a modeling task. In the next few lectures, we’ll put this framework into practice by implementing various kinds of models.
+A model is an idealized representation of a system. A system is a set of principles or procedures according to which something functions. We live in a world full of systems: the procedure of turning on a light happens according to a specific set of rules dictating the flow of electricity. The truth behind how any event occurs is usually complex, and many times the specifics are unknown. The workings of the world can be viewed as its own giant procedure. Models seek to simplify the world and distill it into workable pieces.
+Example: We model the fall of an object on Earth as subject to a constant acceleration of \(9.81 m/s^2\) due to gravity.
+Why do we want to build models? As far as data scientists and statisticians are concerned, there are three reasons, and each implies a different focus on modeling.
+To explain complex phenomena occurring in the world we live in. Examples of this might be:
+In these cases, we care about creating models that are simple and interpretable, allowing us to understand what the relationships between our variables are.
To make accurate predictions about unseen data. Some examples include:
+When making predictions, we care more about making extremely accurate predictions, at the cost of having an uninterpretable model. These are sometimes called black-box models and are common in fields like deep learning.
To measure the causal effects of one event on some other event. For example,
+This is a much harder question because most statistical tools are designed to infer association, not causation. We will not focus on this task in Data 100, but you can take other advanced classes on causal inference (e.g., Stat 156, Data 102) if you are intrigued!
Most of the time, we aim to strike a balance between building interpretable models and building accurate models.
+In general, models can be split into two categories:
+Deterministic physical (mechanistic) models: Laws that govern how the world works.
+Probabilistic models: Models that attempt to understand how random processes evolve. These are more general and can be used to describe many phenomena in the real world. These models commonly make simplifying assumptions about the nature of the world.
+Note: These specific models are not in the scope of Data 100 and exist to serve as motivation.
+The regression line is the unique straight line that minimizes the mean squared error of estimation among all straight lines. As with any straight line, it can be defined by a slope and a y-intercept:
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+# Set random seed for consistency
+np.random.seed(43)
+plt.style.use('default')
+
+# Generate random noise for plotting
+x = np.linspace(-3, 3, 100)
+y = x * 0.5 - 1 + np.random.randn(100) * 0.3
+
+# plot regression line
+sns.regplot(x=x, y=y);
For a pair of variables \(x\) and \(y\) representing our data \(\mathcal{D} = \{(x_1, y_1), (x_2, y_2), \dots, (x_n, y_n)\}\), we denote their means/averages as \(\bar x\) and \(\bar y\) and standard deviations as \(\sigma_x\) and \(\sigma_y\).
+A variable is represented in standard units if the following are true:
+To convert a variable \(x_i\) into standard units, we subtract its mean from it and divide it by its standard deviation. For example, \(x_i\) in standard units is \(\frac{x_i - \bar x}{\sigma_x}\).
+The correlation (\(r\)) is the average of the product of \(x\) and \(y\), both measured in standard units.
+\[r = \frac{1}{n} \sum_{i=1}^n (\frac{x_i - \bar{x}}{\sigma_x})(\frac{y_i - \bar{y}}{\sigma_y})\]
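+To make the formula concrete, here is a small check (not from the lecture; the arrays are made-up values) that computes \(r\) by converting both variables to standard units and averaging their product. It should agree with `np.corrcoef`.
+# Illustrative check of the correlation formula (assumes numpy is imported as np)
+x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+y = np.array([2.0, 1.9, 3.5, 3.7, 5.1])
+
+x_su = (x - np.mean(x)) / np.std(x)   # convert x to standard units
+y_su = (y - np.mean(y)) / np.std(y)   # convert y to standard units
+r = np.mean(x_su * y_su)              # average of the product of standard units
+
+print(r, np.corrcoef(x, y)[0, 1])     # the two values should match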
+def plot_and_get_corr(ax, x, y, title):
+    ax.set_xlim(-3, 3)
+    ax.set_ylim(-3, 3)
+    ax.set_xticks([])
+    ax.set_yticks([])
+    ax.scatter(x, y, alpha = 0.73)
+    r = np.corrcoef(x, y)[0, 1]
+    ax.set_title(title + " (corr: {})".format(r.round(2)))
+    return r
+
+fig, axs = plt.subplots(2, 2, figsize = (10, 10))
+
+# Just noise
+x1, y1 = np.random.randn(2, 100)
+corr1 = plot_and_get_corr(axs[0, 0], x1, y1, title = "noise")
+
+# Strong linear
+x2 = np.linspace(-3, 3, 100)
+y2 = x2 * 0.5 - 1 + np.random.randn(100) * 0.3
+corr2 = plot_and_get_corr(axs[0, 1], x2, y2, title = "strong linear")
+
+# Unequal spread
+x3 = np.linspace(-3, 3, 100)
+y3 = - x3/3 + np.random.randn(100)*(x3)/2.5
+corr3 = plot_and_get_corr(axs[1, 0], x3, y3, title = "unequal spread")
+extent = axs[1, 0].get_window_extent().transformed(fig.dpi_scale_trans.inverted())
+
+# Strong non-linear
+x4 = np.linspace(-3, 3, 100)
+y4 = 2*np.sin(x4 - 1.5) + np.random.randn(100) * 0.3
+corr4 = plot_and_get_corr(axs[1, 1], x4, y4, title = "strong non-linear")
+
+plt.show()
When the variables \(y\) and \(x\) are measured in standard units, the regression line for predicting \(y\) based on \(x\) has slope \(r\) and passes through the origin.
+\[\hat{y}_{su} = r \cdot x_{su}\]
+\[\frac{\hat{y} - \bar{y}}{\sigma_y} = r \cdot \frac{x - \bar{x}}{\sigma_x}\]
+Starting from the top, we have our claimed form of the regression line, and we want to show that it is equivalent to the optimal linear regression line: \(\hat{y} = \hat{a} + \hat{b}x\).
+Recall:
+At a high level, a model is a way of representing a system. In Data 100, we’ll treat a model as some mathematical rule we use to describe the relationship between variables.
+What variables are we modeling? Typically, we use a subset of the variables in our sample of collected data to model another variable in this data. To put this more formally, say we have the following dataset \(\mathcal{D}\):
+\[\mathcal{D} = \{(x_1, y_1), (x_2, y_2), ..., (x_n, y_n)\}\]
+Each pair of values \((x_i, y_i)\) represents a datapoint. In a modeling setting, we call these observations. \(y_i\) is the dependent variable we are trying to model, also called an output or response. \(x_i\) is the independent variable inputted into the model to make predictions, also known as a feature.
+Our goal in modeling is to use the observed data \(\mathcal{D}\) to predict the output variable \(y_i\). We denote each prediction as \(\hat{y}_i\) (read: “y hat sub i”).
+How do we generate these predictions? Some examples of models we’ll encounter in the next few lectures are given below:
+\[\hat{y}_i = \theta\] \[\hat{y}_i = \theta_0 + \theta_1 x_i\]
+The examples above are known as parametric models. They relate the collected data, \(x_i\), to the prediction we make, \(\hat{y}_i\). A few parameters (\(\theta\), \(\theta_0\), \(\theta_1\)) are used to describe the relationship between \(x_i\) and \(\hat{y}_i\).
+Notice that we don’t immediately know the values of these parameters. While the features, \(x_i\), are taken from our observed data, we need to decide what values to give \(\theta\), \(\theta_0\), and \(\theta_1\) ourselves. This is the heart of parametric modeling: what parameter values should we choose so our model makes the best possible predictions?
+To choose our model parameters, we’ll work through the modeling process.
+Our first step is choosing a model: defining the mathematical rule that describes the relationship between the features, \(x_i\), and predictions \(\hat{y}_i\).
+In Data 8, you learned about the Simple Linear Regression (SLR) model. You learned that the model takes the form: \[\hat{y}_i = a + bx_i\]
+In Data 100, we’ll use slightly different notation: we will replace \(a\) with \(\theta_0\) and \(b\) with \(\theta_1\). This will allow us to use the same notation when we explore more complex models later on in the course.
+\[\hat{y}_i = \theta_0 + \theta_1 x_i\]
+The parameters of the SLR model are \(\theta_0\), also called the intercept term, and \(\theta_1\), also called the slope term. To create an effective model, we want to choose values for \(\theta_0\) and \(\theta_1\) that most accurately predict the output variable. The “best” fitting model parameters are given the special names: \(\hat{\theta}_0\) and \(\hat{\theta}_1\); they are the specific parameter values that allow our model to generate the best possible predictions.
+In Data 8, you learned that the best SLR model parameters are: \[\hat{\theta}_0 = \bar{y} - \hat{\theta}_1\bar{x} \qquad \qquad \hat{\theta}_1 = r \frac{\sigma_y}{\sigma_x}\]
+A quick reminder on notation:
+In Data 100, we want to understand how to derive these best model coefficients. To do so, we’ll introduce the concept of a loss function.
+We’ve talked about the idea of creating the “best” possible predictions. This begs the question: how do we decide how “good” or “bad” our model’s predictions are?
+A loss function characterizes the cost, error, or fit resulting from a particular choice of model or model parameters. This function, \(L(y, \hat{y})\), quantifies how “bad” or “far off” a single prediction by our model is from a true, observed value in our collected data.
+The choice of loss function for a particular model will affect the accuracy and computational cost of estimation, and it’ll also depend on the estimation task at hand. For example,
+Regardless of the specific function used, a loss function should follow two basic principles:
+Two common choices of loss function are squared loss and absolute loss.
+Squared loss, also known as L2 loss, computes loss as the square of the difference between the observed \(y_i\) and predicted \(\hat{y}_i\): \[L(y_i, \hat{y}_i) = (y_i - \hat{y}_i)^2\]
+Absolute loss, also known as L1 loss, computes loss as the absolute difference between the observed \(y_i\) and predicted \(\hat{y}_i\): \[L(y_i, \hat{y}_i) = |y_i - \hat{y}_i|\]
+L1 and L2 loss give us a tool for quantifying our model’s performance on a single data point. This is a good start, but ideally, we want to understand how our model performs across our entire dataset. A natural way to do this is to compute the average loss across all data points in the dataset. This is known as the cost function, \(\hat{R}(\theta)\): \[\hat{R}(\theta) = \frac{1}{n} \sum^n_{i=1} L(y_i, \hat{y}_i)\]
+The cost function has many names in the statistics literature. You may also encounter the terms:
+We can substitute our L1 and L2 loss into the cost function definition. The Mean Squared Error (MSE) is the average squared loss across a dataset: \[\text{MSE} = \frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2\]
+The Mean Absolute Error (MAE) is the average absolute loss across a dataset: \[\text{MAE}= \frac{1}{n} \sum_{i=1}^n |y_i - \hat{y}_i|\]
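+As a quick illustration (not part of the original notes; the numbers below are made up), both cost functions can be written directly with numpy.
+# Toy observed values and predictions (hypothetical numbers, for illustration only)
+y_obs = np.array([2.0, 3.5, 4.0, 5.5])
+y_hat = np.array([2.5, 3.0, 4.5, 5.0])
+
+mse = np.mean((y_obs - y_hat) ** 2)   # average squared (L2) loss
+mae = np.mean(np.abs(y_obs - y_hat))  # average absolute (L1) loss
+print(mse, mae)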
+Now that we’ve established the concept of a loss function, we can return to our original goal of choosing model parameters. Specifically, we want to choose the best set of model parameters that will minimize the model’s cost on our dataset. This process is called fitting the model.
+We know from calculus that a function is minimized when (1) its first derivative is equal to zero and (2) its second derivative is positive. We often call the function being minimized the objective function (our objective is to find its minimum).
+To find the optimal model parameter, we:
+We repeat this process for each parameter present in the model. For now, we’ll disregard the second derivative condition.
+To help us make sense of this process, let’s put it into action by deriving the optimal model parameters for simple linear regression using the mean squared error as our cost function. Remember: although the notation may look tricky, all we are doing is following the three steps above!
+Step 1: take the derivative of the cost function with respect to each model parameter. We substitute the SLR model, \(\hat{y}_i = \theta_0+\theta_1 x_i\), into the definition of MSE above and differentiate with respect to \(\theta_0\) and \(\theta_1\). \[\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 = \frac{1}{n} \sum_{i=1}^{n} (y_i - \theta_0 - \theta_1 x_i)^2\]
+\[\frac{\partial}{\partial \theta_0} \text{MSE} = \frac{-2}{n} \sum_{i=1}^{n} (y_i - \theta_0 - \theta_1 x_i)\]
+\[\frac{\partial}{\partial \theta_1} \text{MSE} = \frac{-2}{n} \sum_{i=1}^{n} (y_i - \theta_0 - \theta_1 x_i)x_i\]
+Let’s walk through these derivations in more depth, starting with the derivative of MSE with respect to \(\theta_0\).
+Given our MSE above, we know that: \[\frac{\partial}{\partial \theta_0} \text{MSE} = \frac{\partial}{\partial \theta_0} \frac{1}{n} \sum_{i=1}^{n} {(y_i - \theta_0 - \theta_1 x_i)}^{2}\]
+Noting that the derivative of sum is equivalent to the sum of derivatives, this then becomes: \[ = \frac{1}{n} \sum_{i=1}^{n} \frac{\partial}{\partial \theta_0} {(y_i - \theta_0 - \theta_1 x_i)}^{2}\]
+We can then apply the chain rule.
+\[ = \frac{1}{n} \sum_{i=1}^{n} 2 \cdot (y_i - \theta_0 - \theta_1 x_i) \cdot (-1)\]
+Finally, we can simplify the constants, leaving us with our answer.
+\[\frac{\partial}{\partial \theta_0} \text{MSE} = \frac{-2}{n} \sum_{i=1}^{n}{(y_i - \theta_0 - \theta_1 x_i)}\]
+Following the same procedure, we can take the derivative of MSE with respect to \(\theta_1\).
+\[\frac{\partial}{\partial \theta_1} \text{MSE} = \frac{\partial}{\partial \theta_1} \frac{1}{n} \sum_{i=1}^{n} {(y_i - \theta_0 - \theta_1 x_i)}^{2}\]
+\[ = \frac{1}{n} \sum_{i=1}^{n} \frac{\partial}{\partial \theta_1} {(y_i - \theta_0 - \theta_1 x_i)}^{2}\]
+\[ = \frac{1}{n} \sum_{i=1}^{n} 2 \cdot (y_i - \theta_0 - \theta_1 x_i) \cdot (-x_i)\]
+\[= \frac{-2}{n} \sum_{i=1}^{n} {(y_i - \theta_0 - \theta_1 x_i)}x_i\]
+Step 2: set the derivatives equal to 0. After simplifying terms, this produces two estimating equations. The best set of model parameters \((\hat{\theta}_0, \hat{\theta}_1)\) must satisfy these two optimality conditions. \[0 = \frac{-2}{n} \sum_{i=1}^{n} (y_i - \hat{\theta}_0 - \hat{\theta}_1 x_i) \Longleftrightarrow \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i) = 0\] \[0 = \frac{-2}{n} \sum_{i=1}^{n} (y_i - \hat{\theta}_0 - \hat{\theta}_1 x_i)x_i \Longleftrightarrow \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i)x_i = 0\]
+Step 3: solve the estimating equations to compute estimates for \(\hat{\theta}_0\) and \(\hat{\theta}_1\).
+Taking the first equation gives the estimate of \(\hat{\theta}_0\): \[\frac{1}{n} \sum_{i=1}^n (y_i - \hat{\theta}_0 - \hat{\theta}_1 x_i) = 0 \]
+\[\left(\frac{1}{n} \sum_{i=1}^n y_i \right) - \hat{\theta}_0 - \hat{\theta}_1\left(\frac{1}{n} \sum_{i=1}^n x_i \right) = 0\]
+\[ \hat{\theta}_0 = \bar{y} - \hat{\theta}_1 \bar{x}\]
+With a bit more maneuvering, the second equation gives the estimate of \(\hat{\theta}_1\). Start by multiplying the first estimating equation by \(\bar{x}\), then subtracting the result from the second estimating equation.
+\[\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)x_i - \frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)\bar{x} = 0 \]
+\[\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)(x_i - \bar{x}) = 0 \]
+Next, plug in \(\hat{y}_i = \hat{\theta}_0 + \hat{\theta}_1 x_i = \bar{y} + \hat{\theta}_1(x_i - \bar{x})\):
+\[\frac{1}{n} \sum_{i=1}^n (y_i - \bar{y} - \hat{\theta}_1(x_i - \bar{x}))(x_i - \bar{x}) = 0 \]
+\[\frac{1}{n} \sum_{i=1}^n (y_i - \bar{y})(x_i - \bar{x}) = \hat{\theta}_1 \times \frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})^2\]
+By using the definition of correlation \(\left(r = \frac{1}{n} \sum_{i=1}^n (\frac{x_i-\bar{x}}{\sigma_x})(\frac{y_i-\bar{y}}{\sigma_y}) \right)\) and standard deviation \(\left(\sigma_x = \sqrt{\frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})^2} \right)\), we can conclude: \[r \sigma_x \sigma_y = \hat{\theta}_1 \times \sigma_x^2\] \[\hat{\theta}_1 = r \frac{\sigma_y}{\sigma_x}\]
+Just as was given in Data 8!
+Remember, this derivation found the optimal model parameters for SLR when using the MSE cost function. If we had used a different model or different loss function, we likely would have found different values for the best model parameters. However, regardless of the model and loss used, we can always follow these three steps to fit the model.
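+As a sanity check (not in the original lecture; the data below is simulated), we can verify the closed-form estimates numerically: the formulas \(\hat{\theta}_1 = r\frac{\sigma_y}{\sigma_x}\) and \(\hat{\theta}_0 = \bar{y} - \hat{\theta}_1\bar{x}\) should agree with `np.polyfit`, which also minimizes MSE for a line.
+# Verify the closed-form SLR estimates on simulated data
+rng = np.random.default_rng(42)
+x = rng.normal(size=100)
+y = 2 + 3 * x + rng.normal(scale=0.5, size=100)
+
+r = np.corrcoef(x, y)[0, 1]
+theta1_hat = r * np.std(y) / np.std(x)
+theta0_hat = np.mean(y) - theta1_hat * np.mean(x)
+
+# np.polyfit fits a degree-1 polynomial by least squares, so it should match
+slope, intercept = np.polyfit(x, y, 1)
+print(theta0_hat, theta1_hat)
+print(intercept, slope)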
+Up until this point in the class, we’ve focused on regression tasks - that is, predicting an unbounded numerical quantity from a given dataset. We discussed optimization, feature engineering, and regularization all in the context of performing regression to predict some quantity.
+Now that we have this deep understanding of the modeling process, let’s expand our knowledge of possible modeling tasks.
+In the next two lectures, we’ll tackle the task of classification. A classification problem aims to classify data into categories. Unlike in regression, where we predicted a numeric output, classification involves predicting some categorical variable, or response, \(y\). Examples of classification tasks include:
+There are a couple of different types of classification:
+We can further combine multiple related classification predictions (e.g., translation, voice recognition, etc.) to tackle complex problems through structured prediction tasks.
+In Data 100, we will mostly deal with binary classification, where we are attempting to classify data into one of two classes.
+To build a classification model, we need to modify our modeling workflow slightly. Recall that in regression we:
+In classification, however, we no longer want to output numeric predictions; instead, we want to predict the class to which a datapoint belongs. This means that we need to update our workflow. To build a classification model, we will:
+There are two key differences: as we’ll soon see, we need to incorporate a non-linear transformation to capture the non-linear relationships hidden in our data. We do so by applying the sigmoid function to a linear combination of the features. Secondly, we must apply a decision rule to convert the numeric quantities computed by our model into an actual class prediction. This can be as simple as saying that any datapoint with a feature greater than some number \(x\) belongs to Class 1.
+Regression:
+Classification:
+This was a very high-level overview. Let’s walk through the process in detail to clarify what we mean.
+Throughout this lecture, we will work with the games
dataset, which contains information about games played in the NBA basketball league. Our goal will be to use a basketball team’s "GOAL_DIFF"
to predict whether or not a given team won their game ("WON"
). If a team wins their game, we’ll say they belong to Class 1. If they lose, they belong to Class 0.
For those who are curious, "GOAL_DIFF"
represents the difference in successful field goal percentages between the two competing teams.
import warnings
+"ignore")
+ warnings.filterwarnings(
+import pandas as pd
+import numpy as np
+='ignore')
+ np.seterr(divide
+= pd.read_csv("data/games").dropna()
+ games games.head()
+ | GAME_ID | +TEAM_NAME | +MATCHUP | +WON | +GOAL_DIFF | +AST | +
---|---|---|---|---|---|---|
0 | +21701216 | +Dallas Mavericks | +DAL vs. PHX | +0 | +-0.251 | +20 | +
1 | +21700846 | +Phoenix Suns | +PHX @ GSW | +0 | +-0.237 | +13 | +
2 | +21700071 | +San Antonio Spurs | +SAS @ ORL | +0 | +-0.234 | +19 | +
3 | +21700221 | +New York Knicks | +NYK @ TOR | +0 | +-0.234 | +17 | +
4 | +21700306 | +Miami Heat | +MIA @ NYK | +0 | +-0.222 | +21 | +
Let’s visualize the relationship between "GOAL_DIFF"
and "WON"
using the Seaborn function sns.stripplot
. A strip plot automatically introduces a small amount of random noise to jitter the data. Recall that all values in the "WON"
column are either 1 (won) or 0 (lost) – if we were to directly plot them without jittering, we would see severe overplotting.
import seaborn as sns
+import matplotlib.pyplot as plt
+
+=games, x="GOAL_DIFF", y="WON", orient="h", hue='WON', alpha=0.7)
+ sns.stripplot(data# By default, sns.stripplot plots 0, then 1. We invert the y axis to reverse this behavior
+; plt.gca().invert_yaxis()
This dataset is unlike anything we’ve seen before – our target variable contains only two unique values! (Remember that each y value is either 0 or 1; the plot above jitters the y data slightly for ease of reading.)
+The regression models we have worked with always assumed that we were attempting to predict a continuous target. If we apply a linear regression model to this dataset, something strange happens.
+import sklearn.linear_model as lm
+
+= games[["GOAL_DIFF"]], games["WON"]
+ X, Y = lm.LinearRegression()
+ regression_model
+ regression_model.fit(X, Y)
+"k")
+ plt.plot(X.squeeze(), regression_model.predict(X), =games, x="GOAL_DIFF", y="WON", orient="h", hue='WON', alpha=0.7)
+ sns.stripplot(data; plt.gca().invert_yaxis()
The linear regression fit follows the data as closely as it can. However, this approach has a key flaw - the predicted output, \(\hat{y}\), can be outside the range of possible classes (there are predictions above 1 and below 0). This means that the output can’t always be interpreted (what does it mean to predict a class of -2.3?).
+Our usual linear regression framework won’t work here. Instead, we’ll need to get more creative.
+Back in Data 8, you gradually built up to the concept of linear regression by using the graph of averages. Before you knew the mathematical underpinnings of the regression line, you took a more intuitive approach: you bucketed the \(x\) data into bins of common values, then computed the average \(y\) for all datapoints in the same bin. The result gave you the insight needed to derive the regression fit.
+Let’s take the same approach as we grapple with our new classification task. In the cell below, we 1) bucket the "GOAL_DIFF"
data into bins of similar values and 2) compute the average "WON"
value of all datapoints in a bin.
# bucket the GOAL_DIFF data into 20 bins
+= pd.cut(games["GOAL_DIFF"], 20)
+ bins "bin"] = [(b.left + b.right) / 2 for b in bins]
+ games[= games.groupby("bin")["WON"].mean()
+ win_rates_by_bin
+# plot the graph of averages
+=games, x="GOAL_DIFF", y="WON", orient="h", alpha=0.5, hue='WON') # alpha makes the points transparent
+ sns.stripplot(data="tab:red")
+ plt.plot(win_rates_by_bin.index, win_rates_by_bin, c; plt.gca().invert_yaxis()
Interesting: our result is certainly not like the straight line produced by finding the graph of averages for a linear relationship. We can make two observations:
+Let’s think more about what we’ve just done.
+To find the average \(y\) value for each bin, we computed:
+\[\frac{1 \text{(\# Y = 1 in bin)} + 0 \text{(\# Y = 0 in bin)}}{\text{\# datapoints in bin}} = \frac{\text{\# Y = 1 in bin}}{\text{\# datapoints in bin}} = P(\text{Y = 1} | \text{bin})\]
+This is simply the probability of a datapoint in that bin belonging to Class 1! This aligns with our observation from earlier: all of our predictions lie between 0 and 1, just as we would expect for a probability.
+Our graph of averages was really modeling the probability, \(p\), that a datapoint belongs to Class 1, or essentially that \(\text{Y = 1}\) for a particular value of \(\text{x}\).
+\[ p = P(Y = 1 | \text{ x} )\]
+In logistic regression, we have a new modeling goal. We want to model the probability that a particular datapoint belongs to Class 1 by approximating the S-shaped curve we plotted above. However, we’ve only learned about linear modeling techniques like Linear Regression and OLS.
+Fortunately for us, we’re already well-versed with a technique to model non-linear relationships – we can apply non-linear transformations like log or exponents to make a non-linear relationship more linear. Recall the steps we’ve applied previously:
+In past examples, we used the bulge diagram to help us decide what transformations may be useful. The S-shaped curve we saw above, however, looks nothing like any relationship we’ve seen in the past. We’ll need to think carefully about what transformations will linearize this curve.
+Let’s consider our eventual goal: determining if we should predict a Class of 0 or 1 for each datapoint. Rephrased, we want to decide if it seems more “likely” that the datapoint belongs to Class 0 or to Class 1. One way of deciding this is to see which class has the higher predicted probability for a given datapoint. The odds is defined as the probability of a datapoint belonging to Class 1 divided by the probability of it belonging to Class 0.
+\[\text{odds} = \frac{P(Y=1|x)}{P(Y=0|x)} = \frac{p}{1-p}\]
+If we plot the odds for each input "GOAL_DIFF"
(\(x\)), we see something that looks more promising.
+p = win_rates_by_bin
+odds = p/(1-p)
+
+plt.plot(odds.index, odds)
+plt.xlabel("x")
+plt.ylabel(r"Odds $= \frac{p}{1-p}$");
It turns out that the relationship between our input "GOAL_DIFF"
and the odds is roughly exponential! Let’s linearize the exponential by taking the logarithm (as suggested by the Tukey-Mosteller Bulge Diagram). As a reminder, you should assume that any logarithm in Data 100 is the base \(e\) natural logarithm unless told otherwise.
import numpy as np
+log_odds = np.log(odds)
+plt.plot(odds.index, log_odds, c="tab:green")
+plt.xlabel("x")
+plt.ylabel(r"Log-Odds $= \log{\frac{p}{1-p}}$");
We see something promising – the relationship between the log-odds and our input feature is approximately linear. This means that we can use a linear model to describe the relationship between the log-odds and \(x\). In other words:
+\[\begin{align} +\log{(\frac{p}{1-p})} &= \theta_0 + \theta_1 x_1 + ... + \theta_p x_p\\ +&= x^{\top} \theta +\end{align}\]
+Here, we use \(x^{\top}\) to represent an observation in our dataset, stored as a row vector. You can imagine it as a single row in our design matrix. \(x^{\top} \theta\) indicates a linear combination of the features for this observation (just as we used in multiple linear regression).
+We’re in good shape! We have now derived the following relationship:
+\[\log{(\frac{p}{1-p})} = x^{\top} \theta\]
+Remember that our goal is to predict the probability of a datapoint belonging to Class 1, \(p\). Let’s rearrange this relationship to uncover the original relationship between \(p\) and our input data, \(x^{\top}\).
+\[\begin{align} +\log{(\frac{p}{1-p})} &= x^T \theta\\ +\frac{p}{1-p} &= e^{x^T \theta}\\ +p &= (1-p)e^{x^T \theta}\\ +p &= e^{x^T \theta}- p e^{x^T \theta}\\ +p(1 + e^{x^T \theta}) &= e^{x^T \theta} \\ +p &= \frac{e^{x^T \theta}}{1+e^{x^T \theta}}\\ +p &= \frac{1}{1+e^{-x^T \theta}}\\ +\end{align}\]
+Phew, that was a lot of algebra. What we’ve uncovered is the logistic regression model to predict the probability of a datapoint \(x^{\top}\) belonging to Class 1. If we plot this relationship for our data, we see the S-shaped curve from earlier!
+# We'll discuss the `LogisticRegression` class next time
+xs = np.linspace(-0.3, 0.3)
+
+logistic_model = lm.LogisticRegression(C=20)
+logistic_model.fit(X, Y)
+predicted_prob = logistic_model.predict_proba(xs[:, np.newaxis])[:, 1]
+
+sns.stripplot(data=games, x="GOAL_DIFF", y="WON", orient="h", alpha=0.5)
+plt.plot(xs, predicted_prob, c="k", lw=3, label="Logistic regression model")
+plt.plot(win_rates_by_bin.index, win_rates_by_bin, lw=2, c="tab:red", label="Graph of averages")
+plt.legend(loc="upper left")
+plt.gca().invert_yaxis();
The S-shaped curve is formally known as the sigmoid function and is typically denoted by \(\sigma\).
+\[\sigma(t) = \frac{1}{1+e^{-t}}\]
+ +In the context of our modeling process, the sigmoid is considered an activation function. It takes in a linear combination of the features and applies a non-linear transformation.
+To predict a probability using the logistic regression model, we:
+Our predicted probabilities are of the form \(P(Y=1|x) = p = \frac{1}{1+e^{-x^T \theta}} = \frac{1}{1+e^{-(\theta_0 + \theta_1 x_1 + \theta_2 x_2 + \ldots + \theta_p x_p)}}\)
+An important note: despite its name, logistic regression is used for classification tasks, not regression tasks. In Data 100, we always apply logistic regression with the goal of classifying data.
+Let’s summarize our logistic regression modeling workflow:
+Our main takeaways from this section are:
+Putting this together, we know that the estimated probability that response is 1 given the features \(x\) is equal to the logistic function \(\sigma()\) at the value \(x^{\top}\theta\):
+\[\begin{align} +\hat{P}_{\theta}(Y = 1 | x) = \frac{1}{1 + e^{-x^{\top}\theta}} +\end{align}\]
+More commonly, the logistic regression model is written as:
+\[\begin{align} +\hat{P}_{\theta}(Y = 1 | x) = \sigma(x^{\top}\theta) +\end{align}\]
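+As a small illustration (the parameter and feature values below are made up, not fitted), the predicted probability is just the sigmoid applied to the linear combination \(x^{\top}\theta\):
+def sigmoid(t):
+    return 1 / (1 + np.exp(-t))
+
+# Hypothetical fitted parameters [intercept, coefficient] and one observation
+theta = np.array([0.1, 30.0])
+x = np.array([1.0, 0.05])       # leading 1 pairs with the intercept term
+
+p_hat = sigmoid(x @ theta)      # P(Y = 1 | x)
+print(p_hat)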
+ + +To quantify the error of our logistic regression model, we’ll need to define a new loss function.
+You may wonder: why not use our familiar mean squared error? It turns out that the MSE is not well suited for logistic regression. To see why, let’s consider a simple, artificially generated toy
dataset with just one feature (this will be easier to work with than the more complicated games
data).
+toy_df = pd.DataFrame({
+        "x": [-4, -2, -0.5, 1, 3, 5],
+        "y": [0, 0, 1, 0, 1, 1]})
+toy_df.head()
+ | x | +y | +
---|---|---|
0 | +-4.0 | +0 | +
1 | +-2.0 | +0 | +
2 | +-0.5 | +1 | +
3 | +1.0 | +0 | +
4 | +3.0 | +1 | +
We’ll construct a basic logistic regression model with only one feature and no intercept term. Our predicted probabilities take the form:
+\[p=P(Y=1|x)=\frac{1}{1+e^{-\theta_1 x}}\]
+In the cell below, we plot the MSE for our model on the data.
+def sigmoid(z):
+    return 1/(1+np.e**(-z))
+
+def mse_on_toy_data(theta):
+    p_hat = sigmoid(toy_df['x'] * theta)
+    return np.mean((toy_df['y'] - p_hat)**2)
+
+thetas = np.linspace(-15, 5, 100)
+plt.plot(thetas, [mse_on_toy_data(theta) for theta in thetas])
+plt.title("MSE on toy classification data")
+plt.xlabel(r'$\theta_1$')
+plt.ylabel('MSE');
This looks nothing like the parabola we found when plotting the MSE of a linear regression model! In particular, we can identify two flaws with using the MSE for logistic regression:
+Suffice to say, we don’t want to use the MSE when working with logistic regression. Instead, we’ll consider what kind of behavior we would like to see in a loss function.
+Let \(y\) be the binary label (it can either be 0 or 1), and \(p\) be the model’s predicted probability of the label \(y\) being 1.
+In other words, our loss function should behave differently depending on the value of the true class, \(y\).
+The cross-entropy loss incorporates this changing behavior. We will use it throughout our work on logistic regression. Below, we write out the cross-entropy loss for a single datapoint (no averages just yet).
+\[\text{Cross-Entropy Loss} = \begin{cases} + -\log{(p)} & \text{if } y=1 \\ + -\log{(1-p)} & \text{if } y=0 +\end{cases}\]
+Why does this (seemingly convoluted) loss function “work”? Let’s break it down.
+When \(y=1\) | +When \(y=0\) | +
---|---|
![]() |
+![]() |
+
As \(p \rightarrow 0\), loss approaches \(\infty\) | +As \(p \rightarrow 0\), loss approaches 0 |
As \(p \rightarrow 1\), loss approaches 0 | +As \(p \rightarrow 1\), loss approaches \(\infty\) | +
All good – we are seeing the behavior we want for our logistic regression model.
+The piecewise function we outlined above is difficult to optimize: we don’t want to constantly “check” which form of the loss function we should be using at each step of choosing the optimal model parameters. We can re-express cross-entropy loss in a more convenient way:
+\[\text{Cross-Entropy Loss} = -\left(y\log{(p)}+(1-y)\log{(1-p)}\right)\]
+By setting \(y\) to 0 or 1, we see that this new form of cross-entropy loss gives us the same behavior as the original formulation. Another way to think about this is that in either scenario (y being equal to 0 or 1), only one of the cross-entropy loss terms is activated, which gives us a convenient way to combine the two independent loss functions.
+When \(y=1\):
+\[\begin{align} +\text{CE} &= -\left((1)\log{(p)}+(1-1)\log{(1-p)}\right)\\ +&= -\log{(p)} +\end{align}\]
+When \(y=0\):
+\[\begin{align} +\text{CE} &= -\left((0)\log{(p)}+(1-0)\log{(1-p)}\right)\\ +&= -\log{(1-p)} +\end{align}\]
+The empirical risk of the logistic regression model is then the mean cross-entropy loss across all datapoints in the dataset. When fitting the model, we want to determine the model parameter \(\theta\) that leads to the lowest mean cross-entropy loss possible.
+\[ +\begin{align} +R(\theta) &= - \frac{1}{n} \sum_{i=1}^n \left(y_i\log{(p_i)}+(1-y_i)\log{(1-p_i)}\right) \\ +&= - \frac{1}{n} \sum_{i=1}^n \left(y_i\log{\sigma(X_i^{\top}\theta)}+(1-y_i)\log{(1-\sigma(X_i^{\top}\theta))}\right) +\end{align} +\]
+The optimization problem is therefore to find the estimate \(\hat{\theta}\) that minimizes \(R(\theta)\):
+\[ +\hat{\theta} = \underset{\theta}{\arg\min} - \frac{1}{n} \sum_{i=1}^n \left(y_i\log{(\sigma(X_i^{\top}\theta))}+(1-y_i)\log{(1-\sigma(X_i^{\top}\theta))}\right) +\]
+Plotting the cross-entropy loss surface for our toy
dataset gives us a more encouraging result – our loss function is now convex. This means we can optimize it using gradient descent. Computing the gradient of the logistic model is fairly challenging, so we’ll let sklearn
take care of this for us. You won’t need to compute the gradient of the logistic model in Data 100.
def cross_entropy(y, p_hat):
+    return - y * np.log(p_hat) - (1 - y) * np.log(1 - p_hat)
+
+def mean_cross_entropy_on_toy_data(theta):
+    p_hat = sigmoid(toy_df['x'] * theta)
+    return np.mean(cross_entropy(toy_df['y'], p_hat))
+
+plt.plot(thetas, [mean_cross_entropy_on_toy_data(theta) for theta in thetas], color = 'green')
+plt.ylabel(r'Mean Cross-Entropy Loss($\theta$)')
+plt.xlabel(r'$\theta$');
It may have seemed like we pulled cross-entropy loss out of thin air. How did we know that taking the negative logarithms of our probabilities would work so well? It turns out that cross-entropy loss is justified by probability theory.
+The following section is out of scope, but is certainly an interesting read!
+To build some intuition for logistic regression, let’s look at an introductory example to classification: the coin flip. Suppose we observe some outcomes of a coin flip (1 = Heads, 0 = Tails).
+flips = [0, 0, 1, 1, 1, 1, 0, 0, 0, 0]
+flips
[0, 0, 1, 1, 1, 1, 0, 0, 0, 0]
+A reasonable model is to assume all flips are IID (independent and identically distributed). In other words, each flip has the same probability of returning a 1 (or heads). Let’s define a parameter \(\theta\), the probability that the next flip is a heads. We will use this parameter to inform our decision for \(\hat y\) (predicting either 0 or 1) of the next flip. If \(\theta \ge 0.5, \hat y = 1, \text{else } \hat y = 0\).
+You may be inclined to say \(0.5\) is the best choice for \(\theta\). However, notice that we made no assumption about the coin itself. The coin may be biased, so we should make our decision based only on the data. We know that exactly \(\frac{4}{10}\) of the flips were heads, so we might guess \(\hat \theta = 0.4\). In the next section, we will mathematically prove why this is the best possible estimate.
+Let’s call the result of the coin flip a random variable \(Y\). This is a Bernoulli random variable with two outcomes. \(Y\) has the following distribution:
+\[P(Y = y) = \begin{cases} + p, \text{if } y=1\\ + 1 - p, \text{if } y=0 + \end{cases} \]
+\(p\) is unknown to us. But we can find the \(p\) that makes the data we observed the most likely.
+The probability of observing 4 heads and 6 tails follows the binomial distribution.
+\[\binom{10}{4} (p)^4 (1-p)^6\]
+We define the likelihood of obtaining our observed data as a quantity proportional to the probability above. To find it, simply multiply the probabilities of obtaining each coin flip.
+\[(p)^{4} (1-p)^6\]
+The technique known as maximum likelihood estimation finds the \(p\) that maximizes the above likelihood. You can find this maximum by taking the derivative of the likelihood, but we’ll provide a more intuitive graphical solution.
+thetas = np.linspace(0, 1)
+plt.plot(thetas, (thetas**4)*(1-thetas)**6)
+plt.xlabel(r"$\theta$")
+plt.ylabel("Likelihood");
More generally, the likelihood for some Bernoulli(\(p\)) random variable \(Y\) is:
+\[P(Y = y) = \begin{cases}
    p, \text{if } y=1\\
    1 - p, \text{if } y=0
  \end{cases} \]
+Equivalently, this can be written in a compact way:
+\[P(Y=y) = p^y(1-p)^{1-y}\]
+In our example, a Bernoulli random variable is analogous to a single data point (e.g., one instance of a basketball team winning or losing a game). All together, our games
data consists of many IID Bernoulli(\(p\)) random variables. To find the likelihood of independent events in succession, simply multiply their likelihoods.
\[\prod_{i=1}^{n} p^{y_i} (1-p)^{1-y_i}\]
+As with the coin example, we want to find the parameter \(p\) that maximizes this likelihood. Earlier, we gave an intuitive graphical solution, but let’s take the derivative of the likelihood to find this maximum.
+At a first glance, this derivative will be complicated! We will have to use the product rule, followed by the chain rule. Instead, we can make an observation that simplifies the problem.
+Finding the \(p\) that maximizes \[\prod_{i=1}^{n} p^{y_i} (1-p)^{1-y_i}\] is equivalent to the \(p\) that maximizes \[\text{log}(\prod_{i=1}^{n} p^{y_i} (1-p)^{1-y_i})\]
+This is because \(\text{log}\) is a strictly increasing function. It won’t change the maximum or minimum of the function it was applied to. From \(\text{log}\) properties, \(\text{log}(a*b)\) = \(\text{log}(a) + \text{log}(b)\). We can apply this to our equation above to get:
+\[\underset{p}{\text{argmax}} \sum_{i=1}^{n} \text{log}(p^{y_i} (1-p)^{1-y_i})\]
+\[= \underset{p}{\text{argmax}} \sum_{i=1}^{n} (\text{log}(p^{y_i}) + \text{log}((1-p)^{1-y_i}))\]
+\[= \underset{p}{\text{argmax}} \sum_{i=1}^{n} (y_i\text{log}(p) + (1-y_i)\text{log}(1-p))\]
+We can add a constant factor of \(\frac{1}{n}\) out front. It won’t affect the \(p\) that maximizes our likelihood.
+\[=\underset{p}{\text{argmax}} \frac{1}{n} \sum_{i=1}^{n} y_i\text{log}(p) + (1-y_i)\text{log}(1-p)\]
+One last “trick” we can do is change this to a minimization problem by negating the result. This works because negating a concave function yields a convex function, so maximizing the log-likelihood is equivalent to minimizing the negative log-likelihood.
+\[= \underset{p}{\text{argmin}} -\frac{1}{n} \sum_{i=1}^{n} y_i\text{log}(p) + (1-y_i)\text{log}(1-p)\]
+Now let’s say that we have data that are independent with different probability \(p_i\). Then, we would want to find the \(p_1, p_2, \dots, p_n\) that maximize \[\prod_{i=1}^{n} p_i^{y_i} (1-p_i)^{1-y_i}\]
+Setting up and simplifying the optimization problems as we did above, we ultimately want to find:
+\[= \underset{p}{\text{argmin}} -\frac{1}{n} \sum_{i=1}^{n} y_i\text{log}(p_i) + (1-y_i)\text{log}(1-p_i)\]
+For logistic regression, \(p_i = \sigma(x_i^{\top}\theta)\). Plugging that in, we get:
+\[= \underset{\theta}{\text{argmin}} -\frac{1}{n} \sum_{i=1}^{n} y_i\text{log}(\sigma(x_i^{\top}\theta)) + (1-y_i)\text{log}(1-\sigma(x_i^{\top}\theta))\]
+This is exactly our average cross-entropy loss minimization problem from before!
+Why did we do all this complicated math? We have shown that minimizing cross-entropy loss is equivalent to maximizing the likelihood of the training data.
+Note that this is under the assumption that all data is drawn independently from the same logistic regression model with parameter \(\theta\). In fact, many of the model + loss combinations we’ve seen can be motivated using MLE (e.g., OLS, Ridge Regression, etc.). In probability and ML classes, you’ll get the chance to explore MLE further.
+Today, we will continue studying the Logistic Regression model. We will discuss decision boundaries, which inform how a particular prediction is classified, and learn about linear separability. Picking up from last lecture’s discussion of cross-entropy loss, we’ll study a few of its pitfalls and potential remedies. We will also provide an implementation of sklearn
’s logistic regression model. Lastly, we’ll return to decision rules and discuss metrics that allow us to determine our model’s performance in different scenarios.
This will introduce us to the process of thresholding – a technique used to classify data from our model’s predicted probabilities, or \(P(Y=1|x)\). In doing so, we’ll focus on how these thresholding decisions affect the behavior of our model and learn various evaluation metrics useful for binary classification, and apply them to our study of logistic regression.
+In logistic regression, we model the probability that a datapoint belongs to Class 1.
+
Last week, we developed the logistic regression model to predict that probability, but we never actually made any classifications for whether our prediction \(y\) belongs in Class 0 or Class 1.
\[ p = P(Y=1 | x) = \frac{1}{1 + e^{-x^{\top}\theta}}\]
+A decision rule tells us how to interpret the output of the model to make a decision on how to classify a datapoint. We commonly make decision rules by specifying a threshold, \(T\). If the predicted probability is greater than or equal to \(T\), predict Class 1. Otherwise, predict Class 0.
+\[\hat y = \text{classify}(x) = \begin{cases} + 1, & P(Y=1|x) \ge T\\ + 0, & \text{otherwise } + \end{cases}\]
+The threshold is often set to \(T = 0.5\), but not always. We’ll discuss why we might want to use other thresholds \(T \neq 0.5\) later in this lecture.
+Using our decision rule, we can define a decision boundary as the “line” that splits the data into classes based on its features. For logistic regression, since we are working in \(p\) dimensions, the decision boundary is a hyperplane – a linear combination of the features in \(p\)-dimensions – and we can recover it from the final logistic regression model. For example, if we have a model with 2 features (2D), we have \(\theta = [\theta_0, \theta_1, \theta_2]\) including the intercept term, and we can solve for the decision boundary like so:
+\[ +\begin{align} +T &= \frac{1}{1 + e^{-(\theta_0 + \theta_1 * \text{feature1} + \theta_2 * \text{feature2})}} \\ +1 + e^{-(\theta_0 + \theta_1 \cdot \text{feature1} + \theta_2 \cdot \text{feature2})} &= \frac{1}{T} \\ +e^{-(\theta_0 + \theta_1 \cdot \text{feature1} + \theta_2 \cdot \text{feature2})} &= \frac{1}{T} - 1 \\ +\theta_0 + \theta_1 \cdot \text{feature1} + \theta_2 \cdot \text{feature2} &= -\log(\frac{1}{T} - 1) +\end{align} +\]
+For a model with 2 features, the decision boundary is a line in terms of its features. To make it easier to visualize, we’ve included an example of a 1-dimensional and a 2-dimensional decision boundary below. Notice how the decision boundary predicted by our logistic regression model perfectly separates the points into two classes. Here the color is the predicted class, rather than the true class.
+In real life, however, that is often not the case, and we often see some overlap between points of different classes across the decision boundary. The true classes of the 2D data are shown below:
+As you can see, the decision boundary predicted by our logistic regression does not perfectly separate the two classes. There’s a “muddled” region near the decision boundary where our classifier predicts the wrong class. What would the data have to look like for the classifier to make perfect predictions?
+A classification dataset is said to be linearly separable if there exists a hyperplane among input features \(x\) that separates the two classes \(y\).
+Linear separability in 1D can be found with a rugplot of a single feature where a point perfectly separates the classes (Remember that in 1D, our decision boundary is just a point). For example, notice how the plot on the bottom left is linearly separable along the vertical line \(x=0\). However, no such line perfectly separates the two classes on the bottom right.
+This same definition holds in higher dimensions. If there are two features, the separating hyperplane must exist in two dimensions (any line of the form \(y=mx+b\)). We can visualize this using a scatter plot.
+This sounds great! When the dataset is linearly separable, a logistic regression classifier can perfectly assign datapoints into classes. Can it achieve 0 cross-entropy loss?
+\[-(y \log(p) + (1 - y) \log(1 - p))\]
+Cross-entropy loss is 0 if \(p = 1\) when \(y = 1\), and \(p = 0\) when \(y = 0\). Consider a simple model with one feature and no intercept.
+\[P_{\theta}(Y = 1|x) = \sigma(\theta x) = \frac{1}{1 + e^{-\theta x}}\]
+What \(\theta\) will achieve 0 loss if we train on the datapoint \(x = 1, y = 1\)? We would want \(p = 1\) which occurs when \(\theta \rightarrow \infty\).
+However, (unexpected) complications may arise. When data is linearly separable, the optimal model parameters diverge to \(\pm \infty\). The sigmoid can never output exactly 0 or 1, so no finite optimal \(\theta\) exists. This can be a problem when using gradient descent to fit the model. Consider a simple, linearly separable “toy” dataset with two datapoints.
+Let’s also visualize the mean cross entropy loss along with the direction of the gradient (how this loss surface is calculated is out of scope).
+It’s nearly impossible to see, but the plateau to the right is slightly tilted. Because gradient descent follows the tilted loss surface downwards, it never converges.
+The diverging weights cause the model to be overconfident. Say we add a new point \((x, y) = (-0.5, 1)\). Following the behavior above, our model will incorrectly predict \(p=0\), and thus, \(\hat y = 0\).
+
The loss incurred by this misclassified point is infinite.
\[-(y\text{ log}(p) + (1-y)\text{ log}(1-p)) = -1 \cdot \text{log}(0) = \infty\]
+Thus, diverging weights (\(|\theta| \rightarrow \infty\)) occur with linearly separable data. “Overconfidence”, as shown here, is a particularly dangerous version of overfitting.
+To avoid large weights and infinite loss (particularly on linearly separable data), we use regularization. The same principles apply as with linear regression - make sure to standardize your features first.
+For example, \(L2\) (Ridge) Logistic Regression takes on the form:
+\[\min_{\theta} -\frac{1}{n} \sum_{i=1}^{n} (y_i \text{log}(\sigma(X_i^T\theta)) + (1-y_i)\text{log}(1-\sigma(X_i^T\theta))) + \lambda \sum_{j=1}^{d} \theta_j^2\]
+Now, let us compare the loss functions of un-regularized and regularized logistic regression.
+As we can see, \(L2\) regularization helps us prevent diverging weights and deters against “overconfidence.”
+sklearn
’s logistic regression defaults to \(L2\) regularization and C=1.0
; C
is the inverse of \(\lambda\): \[C = \frac{1}{\lambda}\] Setting C
to a large value, for example, C=300.0
, results in minimal regularization.
# sklearn defaults
+model = LogisticRegression(penalty = 'l2', C = 1.0, ...)
+model.fit()
+Note that in Data 100, we only use sklearn
to fit logistic regression models. There is no closed-form solution to the optimal theta vector, and the gradient is a little messy (see the bonus section below for details).
From here, the .predict
function returns the predicted class \(\hat y\) of the point. In the simple binary case where the threshold is 0.5,
\[\hat y = \begin{cases} + 1, & P(Y=1|x) \ge 0.5\\ + 0, & \text{otherwise } + \end{cases}\]
+You might be thinking, if we’ve already introduced cross-entropy loss, why do we need additional ways of assessing how well our models perform? In linear regression, we made numerical predictions and used a loss function to determine how “good” these predictions were. In logistic regression, our ultimate goal is to classify data – we are much more concerned with whether or not each datapoint was assigned the correct class using the decision rule. As such, we are interested in the quality of classifications, not the predicted probabilities.
+The most basic evaluation metric is accuracy, that is, the proportion of correctly classified points.
+\[\text{accuracy} = \frac{\# \text{ of points classified correctly}}{\# \text{ of total points}}\]
+Translated to code:
+def accuracy(X, Y):
+ return np.mean(model.predict(X) == Y)
+
+model.score(X, y) # built-in accuracy function
+You can find the sklearn
documentation here.
However, accuracy is not always a great metric for classification. To understand why, let’s consider a classification problem with 100 emails where only 5 are truly spam, and the remaining 95 are truly ham. We’ll investigate two models where accuracy is a poor metric.
+As this example illustrates, accuracy is not always a good metric for classification, particularly when your data could exhibit class imbalance (e.g., very few 1’s compared to 0’s).
+There are 4 different classifications that our model might make:
+These classifications can be concisely summarized in a confusion matrix.
+An easy way to remember this terminology is as follows:
+We can now write the accuracy calculation as \[\text{accuracy} = \frac{TP + TN}{n}\]
+In sklearn
, we use the following syntax to plot a confusion matrix:
from sklearn.metrics import confusion_matrix
+cm = confusion_matrix(Y_true, Y_pred)
+The purpose of our discussion of the confusion matrix was to motivate better performance metrics for classification problems with class imbalance - namely, precision and recall.
+Precision is defined as
+\[\text{precision} = \frac{\text{TP}}{\text{TP + FP}}\]
+Precision answers the question: “Of all observations that were predicted to be \(1\), what proportion was actually \(1\)?” It measures how accurate the classifier is when its predictions are positive.
+Recall (or sensitivity) is defined as
+\[\text{recall} = \frac{\text{TP}}{\text{TP + FN}}\]
+Recall aims to answer: “Of all observations that were actually \(1\), what proportion was predicted to be \(1\)?” It measures how many positive predictions were missed.
+Here’s a helpful graphic that summarizes our discussion above.
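+As a small sketch (the counts are made up), both metrics are simple ratios of confusion-matrix entries:
+# Hypothetical confusion-matrix counts
+TP, FP, FN, TN = 40, 10, 5, 45
+
+precision = TP / (TP + FP)   # of predicted positives, how many were truly positive
+recall = TP / (TP + FN)      # of true positives, how many did we catch
+print(precision, recall)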
+In this section, we will calculate the accuracy, precision, and recall performance metrics for our earlier spam classification example. As a reminder, we had 100 emails, 5 of which were spam. We designed two models:
+First, let’s begin by creating the confusion matrix.
++ | 0 | +1 | +
---|---|---|
0 | +True Negative: 95 | +False Positive: 0 | +
1 | +False Negative: 5 | +True Positive: 0 | +
\[\text{accuracy} = \frac{95}{100} = 0.95\] \[\text{precision} = \frac{0}{0 + 0} = \text{undefined}\] \[\text{recall} = \frac{0}{0 + 5} = 0\]
+Notice how our precision is undefined because we never predicted class \(1\). Our recall is 0 for the same reason – the numerator is 0 (we had no positive predictions).
+The confusion matrix for Model 2 is:
++ | 0 | +1 | +
---|---|---|
0 | +True Negative: 0 | +False Positive: 95 | +
1 | +False Negative: 0 | +True Positive: 5 | +
\[\text{accuracy} = \frac{5}{100} = 0.05\] \[\text{precision} = \frac{5}{5 + 95} = 0.05\] \[\text{recall} = \frac{5}{5 + 0} = 1\]
+Our precision is low because we have many false positives, and our recall is perfect - we correctly classified all spam emails (we never predicted class \(0\)).
+Precision (\(\frac{\text{TP}}{\text{TP} + \textbf{ FP}}\)) penalizes false positives, while recall (\(\frac{\text{TP}}{\text{TP} + \textbf{ FN}}\)) penalizes false negatives. In fact, precision and recall are inversely related. This is evident in our second model – we observed a high recall and low precision. Usually, there is a tradeoff in these two (most models can either minimize the number of FP or FN; and in rare cases, both).
+The specific performance metric(s) to prioritize depends on the context. In many medical settings, there might be a much higher cost to missing positive cases. For instance, in our breast cancer example, it is more costly to misclassify malignant tumors (false negatives) than it is to incorrectly classify a benign tumor as malignant (false positives). In the case of the latter, pathologists can conduct further studies to verify malignant tumors. As such, we should minimize the number of false negatives. This is equivalent to maximizing recall.
+The True Positive Rate (TPR) is defined as
+\[\text{true positive rate} = \frac{\text{TP}}{\text{TP + FN}}\]
+You’ll notice this is equivalent to recall. In the context of our spam email classifier, it answers the question: “What proportion of spam did I mark correctly?”. We’d like this to be close to \(1\).
+The True Negative Rate (TNR) is defined as
+\[\text{true negative rate} = \frac{\text{TN}}{\text{TN + FP}}\]
+Another word for TNR is specificity. This answers the question: “What proportion of ham did I mark correctly?”. We’d like this to be close to \(1\).
+The False Positive Rate (FPR) is defined as
+\[\text{false positive rate} = \frac{\text{FP}}{\text{FP + TN}}\]
+FPR is equal to 1 - specificity, or 1 - TNR. This answers the question: “What proportion of regular email did I mark as spam?”. We’d like this to be close to \(0\).
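+With made-up counts, these rates are computed analogously (an illustrative sketch, not lecture code):
+TP, FP, FN, TN = 40, 10, 5, 45   # hypothetical counts
+tpr = TP / (TP + FN)   # true positive rate (recall / sensitivity)
+tnr = TN / (TN + FP)   # true negative rate (specificity)
+fpr = FP / (FP + TN)   # false positive rate = 1 - specificity
+print(tpr, tnr, fpr)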
+As we increase threshold \(T\), both TPR and FPR decrease. We’ve plotted this relationship below for some model on a toy
dataset.
One way to minimize the number of FP vs. FN (equivalently, maximizing precision vs. recall) is by adjusting the classification threshold \(T\).
+\[\hat y = \begin{cases} + 1, & P(Y=1|x) \ge T\\ + 0, & \text{otherwise } + \end{cases}\]
+The default threshold in sklearn
is \(T = 0.5\). As we increase the threshold \(T\), we “raise the standard” of how confident our classifier needs to be to predict 1 (i.e., “positive”).
As you may notice, the choice of threshold \(T\) impacts our classifier’s performance.
+In fact, we can choose a threshold \(T\) based on our desired number, or proportion, of false positives and false negatives. We can do so using a few different tools. We’ll touch on two of the most important ones in Data 100.
+A Precision-Recall Curve (PR Curve) is an alternative to the ROC curve that displays the relationship between precision and recall for various threshold values. In this curve, we test out many different possible thresholds and for each one we compute the precision and recall of the classifier.
+Let’s first consider how precision and recall change as a function of the threshold \(T\). We know this quite well from earlier – precision will generally increase, and recall will decrease.
+Displayed below is the PR Curve for the same toy
dataset. Notice how threshold values increase as we move to the left.
Once again, the perfect classifier will resemble the orange curve, this time, facing the opposite direction.
+We want our PR curve to be as close to the “top right” of this graph as possible. Again, we use the AUC to determine “closeness”, with the perfect classifier exhibiting an AUC = 1 (and the worst with an AUC = 0.5).
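+In practice, sklearn can compute the precision and recall at every candidate threshold for us. Below is a minimal sketch with made-up labels and predicted probabilities:
+from sklearn.metrics import precision_recall_curve
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Made-up labels and predicted probabilities, for illustration only
+y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
+p_hats = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9, 0.65, 0.55])
+
+precision, recall, thresholds = precision_recall_curve(y_true, p_hats)
+plt.plot(recall, precision)
+plt.xlabel("Recall")
+plt.ylabel("Precision");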
+The “Receiver Operating Characteristic” Curve (ROC Curve) plots the tradeoff between FPR and TPR. Notice how the far-left of the curve corresponds to higher threshold \(T\) values. At lower thresholds, the FPR and TPR are both high as there are many positive predictions while at higher thresholds the FPR and TPR are both low as there are fewer positive predictions.
+The “perfect” classifier is the one that has a TPR of 1 and an FPR of 0. This is achieved at the top-left of the plot below. More generally, its ROC curve resembles the curve in orange.
+We want our model to be as close to this orange curve as possible. How do we quantify “closeness”?
+We can compute the area under curve (AUC) of the ROC curve. Notice how the perfect classifier has an AUC = 1. The closer our model’s AUC is to 1, the better it is.
+On the other hand, a terrible model will have an AUC closer to 0.5. Random predictors randomly predict \(P(Y = 1 | x)\) to be uniformly between 0 and 1. This indicates the classifier is not able to distinguish between positive and negative classes, and thus, randomly predicts one of the two.
+We can also illustrate this by comparing different thresholds and seeing their points on the ROC curve.
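+Similarly, sklearn provides the ROC curve and its AUC directly. A minimal sketch with the same kind of made-up labels and probabilities:
+from sklearn.metrics import roc_curve, roc_auc_score
+
+y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
+p_hats = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9, 0.65, 0.55])
+
+fpr, tpr, thresholds = roc_curve(y_true, p_hats)
+auc = roc_auc_score(y_true, p_hats)
+plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
+plt.xlabel("False Positive Rate")
+plt.ylabel("True Positive Rate")
+plt.legend();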
+Let’s define the following terms: \[ +\begin{align} +t_i &= \phi(x_i)^T \theta \\ +p_i &= \sigma(t_i) \\ +t_i &= \log(\frac{p_i}{1 - p_i}) \\ +1 - \sigma(t_i) &= \sigma(-t_i) \\ +\frac{d}{dt} \sigma(t) &= \sigma(t) \sigma(-t) +\end{align} +\]
Now, we can simplify the per-observation cross-entropy term \[
\begin{align}
y_i \log(p_i) + (1 - y_i) \log(1 - p_i) &= y_i \log(\frac{p_i}{1 - p_i}) + \log(1 - p_i) \\
&= y_i \phi(x_i)^T \theta + \log(\sigma(-\phi(x_i)^T \theta))
\end{align}
\]
Hence, the optimal \(\hat{\theta}\) is \[\text{argmin}_{\theta} - \frac{1}{n} \sum_{i=1}^n \left(y_i \phi(x_i)^T \theta + \log(\sigma(-\phi(x_i)^T \theta))\right)\]
We want to minimize \[L(\theta) = - \frac{1}{n} \sum_{i=1}^n \left(y_i \phi(x_i)^T \theta + \log(\sigma(-\phi(x_i)^T \theta))\right)\]
So we take the gradient, using the facts that \(\triangledown_{\theta} \, \phi(x_i)^T \theta = \phi(x_i)\) and \(\triangledown_{\theta} \, \sigma(-\phi(x_i)^T \theta) = -\sigma(-\phi(x_i)^T \theta)\, \sigma(\phi(x_i)^T \theta)\, \phi(x_i)\): \[
\begin{align}
\triangledown_{\theta} L(\theta) &= - \frac{1}{n} \sum_{i=1}^n \left( \triangledown_{\theta} \, y_i \phi(x_i)^T \theta + \triangledown_{\theta} \log(\sigma(-\phi(x_i)^T \theta)) \right) \\
&= - \frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i) + \frac{1}{\sigma(-\phi(x_i)^T \theta)} \triangledown_{\theta} \sigma(-\phi(x_i)^T \theta) \right) \\
&= - \frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i) - \frac{\sigma(-\phi(x_i)^T \theta)}{\sigma(-\phi(x_i)^T \theta)} \sigma(\phi(x_i)^T \theta) \phi(x_i) \right) \\
&= - \frac{1}{n} \sum_{i=1}^n \left( y_i - \sigma(\phi(x_i)^T \theta) \right) \phi(x_i)
\end{align}
\]
+Setting the derivative equal to 0 and solving for \(\hat{\theta}\), we find that there’s no general analytic solution. Therefore, we must solve using numeric methods.
+\[\theta^{(0)} \leftarrow \text{initial vector (random, zeros, ...)} \]
+For \(\tau\) from 0 to convergence: \[ \theta^{(\tau + 1)} \leftarrow \theta^{(\tau)} - \rho(\tau)\left( \frac{1}{n} \sum_{i=1}^n \triangledown_{\theta} L_i(\theta) \mid_{\theta = \theta^{(\tau)}}\right) \]
+\[\theta^{(0)} \leftarrow \text{initial vector (random, zeros, ...)} \]
+For \(\tau\) from 0 to convergence, let \(B\) ~ \(\text{Random subset of indices}\). \[ \theta^{(\tau + 1)} \leftarrow \theta^{(\tau)} - \rho(\tau)\left( \frac{1}{|B|} \sum_{i \in B} \triangledown_{\theta} L_i(\theta) \mid_{\theta = \theta^{(\tau)}}\right) \]
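To make the update rule concrete, here is a minimal sketch of batch gradient descent using the logistic regression gradient derived above. The toy design matrix, zero initialization, constant learning rate, and fixed iteration count are illustrative assumptions, not prescriptions from the lecture.

import numpy as np

def sigmoid(t):
    return 1 / (1 + np.exp(-t))

# Toy data: 4 observations, design matrix with an intercept column (made up for illustration)
Phi = np.array([[1.0, 0.5], [1.0, 1.5], [1.0, 2.5], [1.0, 3.5]])
y = np.array([0, 0, 1, 1])

theta = np.zeros(Phi.shape[1])  # initial vector of zeros
rho = 0.5                       # constant learning rate, a simplification

for _ in range(1000):
    # Gradient of the average cross-entropy loss: -(1/n) * sum of (y_i - sigma(phi^T theta)) * phi(x_i)
    grad = -(1 / len(y)) * Phi.T @ (y - sigmoid(Phi @ theta))
    theta = theta - rho * grad

print(theta)

Replacing the full sum with a random mini-batch of rows at each step turns this sketch into the stochastic version shown above.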
+ + +We’ve now spent a number of lectures exploring how to build effective models – we introduced the SLR and constant models, selected cost functions to suit our modeling task, and applied transformations to improve the linear fit.
+Throughout all of this, we considered models of one feature (\(\hat{y}_i = \theta_0 + \theta_1 x_i\)) or zero features (\(\hat{y}_i = \theta_0\)). As data scientists, we usually have access to datasets containing many features. To make the best models we can, it will be beneficial to consider all of the variables available to us as inputs to a model, rather than just one. In today’s lecture, we’ll introduce multiple linear regression as a framework to incorporate multiple features into a model. We will also learn how to accelerate the modeling process – specifically, we’ll see how linear algebra offers us a powerful set of tools for understanding model performance.
+Multiple linear regression is an extension of simple linear regression that adds additional features to the model. The multiple linear regression model takes the form:
+\[\hat{y} = \theta_0\:+\:\theta_1x_{1}\:+\:\theta_2 x_{2}\:+\:...\:+\:\theta_p x_{p}\]
+Our predicted value of \(y\), \(\hat{y}\), is a linear combination of the single observations (features), \(x_i\), and the parameters, \(\theta_i\).
+We can explore this idea further by looking at a dataset containing aggregate per-player data from the 2018-19 NBA season, downloaded from Kaggle.
import pandas as pd
nba = pd.read_csv('data/nba18-19.csv', index_col=0)
nba.index.name = None # Drops name of index (players are ordered by rank)
nba.head(5)
+ | Player | +Pos | +Age | +Tm | +G | +GS | +MP | +FG | +FGA | +FG% | +... | +FT% | +ORB | +DRB | +TRB | +AST | +STL | +BLK | +TOV | +PF | +PTS | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | +Álex Abrines\abrinal01 | +SG | +25 | +OKC | +31 | +2 | +19.0 | +1.8 | +5.1 | +0.357 | +... | +0.923 | +0.2 | +1.4 | +1.5 | +0.6 | +0.5 | +0.2 | +0.5 | +1.7 | +5.3 | +
2 | +Quincy Acy\acyqu01 | +PF | +28 | +PHO | +10 | +0 | +12.3 | +0.4 | +1.8 | +0.222 | +... | +0.700 | +0.3 | +2.2 | +2.5 | +0.8 | +0.1 | +0.4 | +0.4 | +2.4 | +1.7 | +
3 | +Jaylen Adams\adamsja01 | +PG | +22 | +ATL | +34 | +1 | +12.6 | +1.1 | +3.2 | +0.345 | +... | +0.778 | +0.3 | +1.4 | +1.8 | +1.9 | +0.4 | +0.1 | +0.8 | +1.3 | +3.2 | +
4 | +Steven Adams\adamsst01 | +C | +25 | +OKC | +80 | +80 | +33.4 | +6.0 | +10.1 | +0.595 | +... | +0.500 | +4.9 | +4.6 | +9.5 | +1.6 | +1.5 | +1.0 | +1.7 | +2.6 | +13.9 | +
5 | +Bam Adebayo\adebaba01 | +C | +21 | +MIA | +82 | +28 | +23.3 | +3.4 | +5.9 | +0.576 | +... | +0.735 | +2.0 | +5.3 | +7.3 | +2.2 | +0.9 | +0.8 | +1.5 | +2.5 | +8.9 | +
5 rows × 29 columns
+Let’s say we are interested in predicting the number of points (PTS
) an athlete will score in a basketball game this season.
Suppose we want to fit a linear model by using some characteristics, or features of a player. Specifically, we’ll focus on field goals, assists, and 3-point attempts.
- FG, the average number of (2-point) field goals per game
- AST, the average number of assists per game
- 3PA, the average number of 3-point field goals attempted per game

nba[['FG', 'AST', '3PA', 'PTS']].head()
+ | FG | +AST | +3PA | +PTS | +
---|---|---|---|---|
1 | +1.8 | +0.6 | +4.1 | +5.3 | +
2 | +0.4 | +0.8 | +1.5 | +1.7 | +
3 | +1.1 | +1.9 | +2.2 | +3.2 | +
4 | +6.0 | +1.6 | +0.0 | +13.9 | +
5 | +3.4 | +2.2 | +0.2 | +8.9 | +
Because we are now dealing with many parameter values, we’ve collected them all into a parameter vector with dimensions \((p+1) \times 1\) to keep things tidy. Remember that \(p\) represents the number of features we have (in this case, 3).
+\[\theta = \begin{bmatrix} + \theta_{0} \\ + \theta_{1} \\ + \vdots \\ + \theta_{p} + \end{bmatrix}\]
+We are working with two vectors here: a row vector representing the observed data, and a column vector containing the model parameters. The multiple linear regression model is equivalent to the dot (scalar) product of the observation vector and parameter vector.
+\[[1,\:x_{1},\:x_{2},\:x_{3},\:...,\:x_{p}] \theta = [1,\:x_{1},\:x_{2},\:x_{3},\:...,\:x_{p}] \begin{bmatrix} + \theta_{0} \\ + \theta_{1} \\ + \vdots \\ + \theta_{p} + \end{bmatrix} = \theta_0\:+\:\theta_1x_{1}\:+\:\theta_2 x_{2}\:+\:...\:+\:\theta_p x_{p}\]
+Notice that we have inserted 1 as the first value in the observation vector. When the dot product is computed, this 1 will be multiplied with \(\theta_0\) to give the intercept of the regression model. We call this 1 entry the intercept or bias term.
+Given that we have three features here, we can express this model as: \[\hat{y} = \theta_0\:+\:\theta_1x_{1}\:+\:\theta_2 x_{2}\:+\:\theta_3 x_{3}\]
+Our features are represented by \(x_1\) (FG
), \(x_2\) (AST
), and \(x_3\) (3PA
) with each having correpsonding parameters, \(\theta_1\), \(\theta_2\), and \(\theta_3\).
In statistics, this model + loss is called Ordinary Least Squares (OLS). The solution to OLS is the minimizing loss for parameters \(\hat{\theta}\), also called the least squares estimate.
+We now know how to generate a single prediction from multiple observed features. Data scientists usually work at scale – that is, they want to build models that can produce many predictions, all at once. The vector notation we introduced above gives us a hint on how we can expedite multiple linear regression. We want to use the tools of linear algebra.
+Let’s think about how we can apply what we did above. To accommodate for the fact that we’re considering several feature variables, we’ll adjust our notation slightly. Each observation can now be thought of as a row vector with an entry for each of \(p\) features.
+
+
To make a prediction from the first observation in the data, we take the dot product of the parameter vector and first observation vector. To make a prediction from the second observation, we would repeat this process to find the dot product of the parameter vector and the second observation vector. If we wanted to find the model predictions for each observation in the dataset, we’d repeat this process for all \(n\) observations in the data.
+\[\hat{y}_1 = \theta_0 + \theta_1 x_{11} + \theta_2 x_{12} + ... + \theta_p x_{1p} = [1,\:x_{11},\:x_{12},\:x_{13},\:...,\:x_{1p}] \theta\] \[\hat{y}_2 = \theta_0 + \theta_1 x_{21} + \theta_2 x_{22} + ... + \theta_p x_{2p} = [1,\:x_{21},\:x_{22},\:x_{23},\:...,\:x_{2p}] \theta\] \[\vdots\] \[\hat{y}_n = \theta_0 + \theta_1 x_{n1} + \theta_2 x_{n2} + ... + \theta_p x_{np} = [1,\:x_{n1},\:x_{n2},\:x_{n3},\:...,\:x_{np}] \theta\]
+Our observed data is represented by \(n\) row vectors, each with dimension \((p+1)\). We can collect them all into a single matrix, which we call \(\mathbb{X}\).
+
+
The matrix \(\mathbb{X}\) is known as the design matrix. It contains all observed data for each of our \(p\) features, where each row corresponds to one observation, and each column corresponds to a feature. It often (but not always) contains an additional column of all ones to represent the intercept or bias column.
+To review what is happening in the design matrix: each row represents a single observation. For example, a student in Data 100. Each column represents a feature. For example, the ages of students in Data 100. This convention allows us to easily transfer our previous work in DataFrames over to this new linear algebra perspective.
+
+
The multiple linear regression model can then be restated in terms of matrices: \[ +\Large +\mathbb{\hat{Y}} = \mathbb{X} \theta +\]
+Here, \(\mathbb{\hat{Y}}\) is the prediction vector with \(n\) elements (\(\mathbb{\hat{Y}} \in \mathbb{R}^{n}\)); it contains the prediction made by the model for each of the \(n\) input observations. \(\mathbb{X}\) is the design matrix with dimensions \(\mathbb{X} \in \mathbb{R}^{n \times (p + 1)}\), and \(\theta\) is the parameter vector with dimensions \(\theta \in \mathbb{R}^{(p + 1)}\). Note that our true output \(\mathbb{Y}\) is also a vector with \(n\) elements (\(\mathbb{Y} \in \mathbb{R}^{n}\)).
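As a sketch of this in code (assuming the nba CSV loaded earlier is available at the same path, and using arbitrary placeholder parameters rather than fitted ones), we can stack a column of ones with the three feature columns to form \(\mathbb{X}\); a single matrix-vector product then yields all \(n\) predictions at once.

import numpy as np
import pandas as pd

nba = pd.read_csv('data/nba18-19.csv', index_col=0)

# Design matrix: a bias column of ones followed by the p = 3 feature columns
X = np.hstack([np.ones((len(nba), 1)), nba[['FG', 'AST', '3PA']].to_numpy()])

theta = np.array([2.0, 1.5, 0.5, 1.2])  # placeholder parameters, not fitted values
Y_hat = X @ theta                       # one prediction per player
print(Y_hat.shape)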
+ +We now have a new approach to understanding models in terms of vectors and matrices. To accompany this new convention, we should update our understanding of risk functions and model fitting.
+Recall our definition of MSE: \[R(\theta) = \frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2\]
+At its heart, the MSE is a measure of distance – it gives an indication of how “far away” the predictions are from the true values, on average.
+ +We can express the MSE as a squared L2 norm if we rewrite it in terms of the prediction vector, \(\hat{\mathbb{Y}}\), and true target vector, \(\mathbb{Y}\):
+\[R(\theta) = \frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2 = \frac{1}{n} (||\mathbb{Y} - \hat{\mathbb{Y}}||_2)^2\]
+Here, the superscript 2 outside of the parentheses means that we are squaring the norm. If we plug in our linear model \(\hat{\mathbb{Y}} = \mathbb{X} \theta\), we find the MSE cost function in vector notation:
+\[R(\theta) = \frac{1}{n} (||\mathbb{Y} - \mathbb{X} \theta||_2)^2\]
+Under the linear algebra perspective, our new task is to fit the optimal parameter vector \(\theta\) such that the cost function is minimized. Equivalently, we wish to minimize the norm \[||\mathbb{Y} - \mathbb{X} \theta||_2 = ||\mathbb{Y} - \hat{\mathbb{Y}}||_2.\]
+We can restate this goal in two ways:
+There are several equivalent terms in the context of regression. The ones we use most often for this course are bolded.
+Up until now, we’ve mostly thought of our model as a scalar product between horizontally stacked observations and the parameter vector. We can also think of \(\hat{\mathbb{Y}}\) as a linear combination of feature vectors, scaled by the parameters. We use the notation \(\mathbb{X}_{:, i}\) to denote the \(i\)th column of the design matrix. You can think of this as following the same convention as used when calling .iloc
and .loc
. “:” means that we are taking all entries in the \(i\)th column.
+
\[ +\hat{\mathbb{Y}} = +\theta_0 \begin{bmatrix} + 1 \\ + 1 \\ + \vdots \\ + 1 + \end{bmatrix} + \theta_1 \begin{bmatrix} + x_{11} \\ + x_{21} \\ + \vdots \\ + x_{n1} + \end{bmatrix} + \ldots + \theta_p \begin{bmatrix} + x_{1p} \\ + x_{2p} \\ + \vdots \\ + x_{np} + \end{bmatrix} + = \theta_0 \mathbb{X}_{:,\:1} + \theta_1 \mathbb{X}_{:,\:2} + \ldots + \theta_p \mathbb{X}_{:,\:p+1}\]
+This new approach is useful because it allows us to take advantage of the properties of linear combinations.
+Because the prediction vector, \(\hat{\mathbb{Y}} = \mathbb{X} \theta\), is a linear combination of the columns of \(\mathbb{X}\), we know that the predictions are contained in the span of \(\mathbb{X}\). That is, we know that \(\mathbb{\hat{Y}} \in \text{Span}(\mathbb{X})\).
+The diagram below is a simplified view of \(\text{Span}(\mathbb{X})\), assuming that each column of \(\mathbb{X}\) has length \(n\). Notice that the columns of \(\mathbb{X}\) define a subspace of \(\mathbb{R}^n\), where each point in the subspace can be reached by a linear combination of \(\mathbb{X}\)’s columns. The prediction vector \(\mathbb{\hat{Y}}\) lies somewhere in this subspace.
+
+
Examining this diagram, we find a problem. The vector of true values, \(\mathbb{Y}\), could theoretically lie anywhere in \(\mathbb{R}^n\) space – its exact location depends on the data we collect out in the real world. However, our multiple linear regression model can only make predictions in the subspace of \(\mathbb{R}^n\) spanned by \(\mathbb{X}\). Remember the model fitting goal we established in the previous section: we want to generate predictions such that the distance between the vector of true values, \(\mathbb{Y}\), and the vector of predicted values, \(\mathbb{\hat{Y}}\), is minimized. This means that we want \(\mathbb{\hat{Y}}\) to be the vector in \(\text{Span}(\mathbb{X})\) that is closest to \(\mathbb{Y}\).
+Another way of rephrasing this goal is to say that we wish to minimize the length of the residual vector \(e\), as measured by its \(L_2\) norm.
+
+
The vector in \(\text{Span}(\mathbb{X})\) that is closest to \(\mathbb{Y}\) is always the orthogonal projection of \(\mathbb{Y}\) onto \(\text{Span}(\mathbb{X}).\) Thus, we should choose the parameter vector \(\theta\) that makes the residual vector orthogonal to any vector in \(\text{Span}(\mathbb{X})\). You can visualize this as the vector created by dropping a perpendicular line from \(\mathbb{Y}\) onto the span of \(\mathbb{X}\).
+ +Remember our goal is to find \(\hat{\theta}\) such that we minimize the objective function \(R(\theta)\). Equivalently, this is the \(\hat{\theta}\) such that the residual vector \(e = \mathbb{Y} - \mathbb{X} \hat{\theta}\) is orthogonal to \(\text{Span}(\mathbb{X})\).
+Looking at the definition of orthogonality of \(\mathbb{Y} - \mathbb{X}\hat{\theta}\) to \(span(\mathbb{X})\), we can write: \[\mathbb{X}^T (\mathbb{Y} - \mathbb{X}\hat{\theta}) = \vec{0}\]
+Let’s then rearrange the terms: \[\mathbb{X}^T \mathbb{Y} - \mathbb{X}^T \mathbb{X} \hat{\theta} = \vec{0}\]
+And finally, we end up with the normal equation: \[\mathbb{X}^T \mathbb{X} \hat{\theta} = \mathbb{X}^T \mathbb{Y}\]
+Any vector \(\theta\) that minimizes MSE on a dataset must satisfy this equation.
+If \(\mathbb{X}^T \mathbb{X}\) is invertible, we can conclude: \[\hat{\theta} = (\mathbb{X}^T \mathbb{X})^{-1} \mathbb{X}^T \mathbb{Y}\]
+This is called the least squares estimate of \(\theta\): it is the value of \(\theta\) that minimizes the squared loss.
+Note that the least squares estimate was derived under the assumption that \(\mathbb{X}^T \mathbb{X}\) is invertible. This condition holds true when \(\mathbb{X}^T \mathbb{X}\) is full column rank, which, in turn, happens when \(\mathbb{X}\) is full column rank. The proof for why \(\mathbb{X}\) needs to be full column rank is optional and in the Bonus section at the end.
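A minimal sketch of computing the least squares estimate directly from the normal equation, continuing with the nba features from earlier (np.linalg.solve is used instead of forming an explicit inverse, which is generally better numerically; a library routine such as sklearn's LinearRegression would recover the same coefficients):

import numpy as np
import pandas as pd

nba = pd.read_csv('data/nba18-19.csv', index_col=0)
X = np.hstack([np.ones((len(nba), 1)), nba[['FG', 'AST', '3PA']].to_numpy()])
Y = nba['PTS'].to_numpy()

# Solve (X^T X) theta_hat = X^T Y rather than inverting X^T X explicitly
theta_hat = np.linalg.solve(X.T @ X, X.T @ Y)
Y_hat = X @ theta_hat
print(theta_hat)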
+Our geometric view of multiple linear regression has taken us far! We have identified the optimal set of parameter values to minimize MSE in a model of multiple features. Now, we want to understand how well our fitted model performs.
+One measure of model performance is the Root Mean Squared Error, or RMSE. The RMSE is simply the square root of MSE. Taking the square root converts the value back into the original, non-squared units of \(y_i\), which is useful for understanding the model’s performance. A low RMSE indicates more “accurate” predictions – that there is a lower average loss across the dataset.
+\[\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2}\]
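A sketch of the corresponding computation, using made-up true values and predictions for illustration:

import numpy as np

y = np.array([5.3, 1.7, 3.2, 13.9, 8.9])      # observed values (made up)
y_hat = np.array([5.0, 2.0, 3.5, 13.0, 9.5])  # predictions (made up)

rmse = np.sqrt(np.mean((y - y_hat) ** 2))  # square root of the MSE
print(rmse)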
+When working with SLR, we generated plots of the residuals against a single feature to understand the behavior of residuals. When working with several features in multiple linear regression, it no longer makes sense to consider a single feature in our residual plots. Instead, multiple linear regression is evaluated by making plots of the residuals against the predicted values. As was the case with SLR, a multiple linear model performs well if its residual plot shows no patterns.
+
+
For SLR, we used the correlation coefficient to capture the association between the target variable and a single feature variable. In a multiple linear model setting, we will need a performance metric that can account for multiple features at once. Multiple \(R^2\), also called the coefficient of determination, is the proportion of variance of our fitted values (predictions) \(\hat{y}_i\) to our true values \(y_i\). It ranges from 0 to 1 and is effectively the proportion of variance in the observations that the model explains.
+\[R^2 = \frac{\text{variance of } \hat{y}_i}{\text{variance of } y_i} = \frac{\sigma^2_{\hat{y}}}{\sigma^2_y}\]
+Note that for OLS with an intercept term, for example \(\hat{y} = \theta_0 + \theta_1x_1 + \theta_2x_2 + \cdots + \theta_px_p\), \(R^2\) is equal to the square of the correlation between \(y\) and \(\hat{y}\). On the other hand for SLR, \(R^2\) is equal to \(r^2\), the correlation between \(x\) and \(y\). The proof of these last two properties is out of scope for this course.
+Additionally, as we add more features, our fitted values tend to become closer and closer to our actual values. Thus, \(R^2\) increases.
+Adding more features doesn’t always mean our model is better though! We’ll see why later in the course.
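A sketch of the variance-ratio computation with made-up arrays (for predictions that genuinely come from OLS with an intercept, this ratio also equals the squared correlation between \(y\) and \(\hat{y}\)):

import numpy as np

y = np.array([5.3, 1.7, 3.2, 13.9, 8.9])      # observed values (made up)
y_hat = np.array([5.0, 2.0, 3.5, 13.0, 9.5])  # fitted values (made up)

r2 = np.var(y_hat) / np.var(y)  # variance of the predictions over variance of the observations
print(r2)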
A consequence of this orthogonality is that the residual vector is orthogonal to every column of the design matrix: \[\mathbb{X}^Te = 0 \]
When the design matrix includes the all-ones intercept column, this also implies that the residuals sum to zero: \[\sum_i^n e_i = 0\]
+ +To summarize:
++ | Model | +Estimate | +Unique? | +
---|---|---|---|
Constant Model + MSE | +\(\hat{y} = \theta_0\) | +\(\hat{\theta}_0 = mean(y) = \bar{y}\) | +Yes. Any set of values has a unique mean. | +
Constant Model + MAE | +\(\hat{y} = \theta_0\) | +\(\hat{\theta}_0 = median(y)\) | +Yes, if odd. No, if even. Return the average of the middle 2 values. | +
Simple Linear Regression + MSE | +\(\hat{y} = \theta_0 + \theta_1x\) | +\(\hat{\theta}_0 = \bar{y} - \hat{\theta}_1\bar{x}\) \(\hat{\theta}_1 = r\frac{\sigma_y}{\sigma_x}\) | +Yes. Any set of non-constant* values has a unique mean, SD, and correlation coefficient. | +
OLS (Linear Model + MSE) | +\(\mathbb{\hat{Y}} = \mathbb{X}\mathbb{\theta}\) | +\(\hat{\theta} = (\mathbb{X}^T\mathbb{X})^{-1}\mathbb{X}^T\mathbb{Y}\) | +Yes, if \(\mathbb{X}\) is full column rank (all columns are linearly independent, # of datapoints >>> # of features). | +
The Least Squares estimate \(\hat{\theta}\) is unique if and only if \(\mathbb{X}\) is full column rank.
Therefore, if \(\mathbb{X}\) is not full column rank, we will not have unique estimates. This can happen for two major reasons: if one feature can be written as a linear combination of the others (redundant features), or if there are fewer data points than there are columns of \(\mathbb{X}\).
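A small sketch of the first failure mode, with made-up numbers: when one column of \(\mathbb{X}\) is an exact copy (or any linear combination) of others, \(\mathbb{X}^T \mathbb{X}\) becomes singular and the normal equation no longer pins down a unique \(\hat{\theta}\).

import numpy as np

# The second and third columns are identical, so X is not full column rank
X = np.array([[1.0, 2.0, 2.0],
              [1.0, 3.0, 3.0],
              [1.0, 5.0, 5.0],
              [1.0, 7.0, 7.0]])

print(np.linalg.matrix_rank(X))  # 2, even though X has 3 columns
print(np.linalg.det(X.T @ X))    # (numerically) 0: X^T X is singular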
+In this sequence of lectures, we will dive right into things by having you explore and manipulate real-world data. We’ll first introduce pandas
, a popular Python library for interacting with tabular data.
Data scientists work with data stored in a variety of formats. This class focuses primarily on tabular data — data that is stored in a table.
+Tabular data is one of the most common systems that data scientists use to organize data. This is in large part due to the simplicity and flexibility of tables. Tables allow us to represent each observation, or instance of collecting data from an individual, as its own row. We can record each observation’s distinct characteristics, or features, in separate columns.
+To see this in action, we’ll explore the elections
dataset, which stores information about political candidates who ran for president of the United States in previous years.
In the elections
dataset, each row (blue box) represents one instance of a candidate running for president in a particular year. For example, the first row represents Andrew Jackson running for president in the year 1824. Each column (yellow box) represents one characteristic piece of information about each presidential candidate. For example, the column named “Result” stores whether or not the candidate won the election.
Your work in Data 8 helped you grow very familiar with using and interpreting data stored in a tabular format. Back then, you used the Table
class of the datascience
library, a special programming library created specifically for Data 8 students.
In Data 100, we will be working with the programming library pandas
, which is generally accepted in the data science community as the industry- and academia-standard tool for manipulating tabular data (as well as the inspiration for Petey, our panda bear mascot).
Using pandas
, we can
NumPy
functions to our data (our friends from Data 8).Series
, DataFrame
s, and IndicesTo begin our work in pandas
, we must first import the library into our Python environment. This will allow us to use pandas
data structures and methods in our code.
# `pd` is the conventional alias for Pandas, as `np` is for NumPy
+import pandas as pd
There are three fundamental data structures in pandas
:
Series
: 1D labeled array data; best thought of as columnar data.DataFrame
: 2D tabular data with rows and columns.Index
: A sequence of row/column labels.DataFrame
s, Series
, and Indices can be represented visually in the following diagram, which considers the first few rows of the elections
dataset.
Notice how the DataFrame is a two-dimensional object — it contains both rows and columns. The Series above is a singular column of this DataFrame
, namely the Result
column. Both contain an Index, or a shared list of row labels (the integers from 0 to 4, inclusive).
A Series
represents a column of a DataFrame
; more generally, it can be any 1-dimensional array-like object. It contains both:
In the cell below, we create a Series
named s
.
s = pd.Series(["welcome", "to", "data 100"])
s
0 welcome
+1 to
+2 data 100
+dtype: object
+# Accessing data values within the Series
+ s.values
array(['welcome', 'to', 'data 100'], dtype=object)
+# Accessing the Index of the Series
+ s.index
RangeIndex(start=0, stop=3, step=1)
+By default, the index
of a Series
is a sequential list of integers beginning from 0. Optionally, a manually specified list of desired indices can be passed to the index
argument.
s = pd.Series([-1, 10, 2], index = ["a", "b", "c"])
s
a -1
+b 10
+c 2
+dtype: int64
+ s.index
Index(['a', 'b', 'c'], dtype='object')
+Indices can also be changed after initialization.
+= ["first", "second", "third"]
+ s.index s
first -1
+second 10
+third 2
+dtype: int64
+ s.index
Index(['first', 'second', 'third'], dtype='object')
+Series
Much like when working with NumPy
arrays, we can select a single value or a set of values from a Series
. To do so, there are three primary methods:
To demonstrate this, let’s define the Series ser
.
ser = pd.Series([4, -2, 0, 6], index = ["a", "b", "c", "d"])
ser
a 4
+b -2
+c 0
+d 6
+dtype: int64
+# We return the value stored at the index label "a"
ser["a"]
np.int64(4)
+# We return a Series of the values stored at the index labels "a" and "c"
ser[["a", "c"]]
a 4
+c 0
+dtype: int64
+Perhaps the most interesting (and useful) method of selecting data from a Series
is by using a filtering condition.
First, we apply a boolean operation to the Series
. This creates a new Series
of boolean values.
# Filter condition: select all elements greater than 0
ser > 0
a True
+b False
+c False
+d True
+dtype: bool
+We then use this boolean condition to index into our original Series
. pandas
will select only the entries in the original Series
that satisfy the condition.
ser[ser > 0]
a 4
+d 6
+dtype: int64
+DataFrames
Typically, we will work with Series
using the perspective that they are columns in a DataFrame
. We can think of a DataFrame
as a collection of Series
that all share the same Index
.
In Data 8, you encountered the Table
class of the datascience
library, which represented tabular data. In Data 100, we’ll be using the DataFrame
class of the pandas
library.
DataFrame
There are many ways to create a DataFrame
. Here, we will cover the most popular approaches:
Series
.More generally, the syntax for creating a DataFrame
is:
pandas.DataFrame(data, index, columns)
+In Data 100, our data are typically stored in a CSV (comma-separated values) file format. We can import a CSV file into a DataFrame
by passing the data path as an argument to the following pandas
function.
pd.read_csv("filename.csv")
With our new understanding of pandas
in hand, let’s return to the elections
dataset from before. Now, we can recognize that it is represented as a pandas
DataFrame
.
elections = pd.read_csv("data/elections.csv")
elections
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +
4 | +1832 | +Andrew Jackson | +Democratic | +702735 | +win | +54.574789 | +
... | +... | +... | +... | +... | +... | +... | +
177 | +2016 | +Jill Stein | +Green | +1457226 | +loss | +1.073699 | +
178 | +2020 | +Joseph Biden | +Democratic | +81268924 | +win | +51.311515 | +
179 | +2020 | +Donald Trump | +Republican | +74216154 | +loss | +46.858542 | +
180 | +2020 | +Jo Jorgensen | +Libertarian | +1865724 | +loss | +1.177979 | +
181 | +2020 | +Howard Hawkins | +Green | +405035 | +loss | +0.255731 | +
182 rows × 6 columns
+This code stores our DataFrame
object in the elections
variable. Upon inspection, our elections
DataFrame
has 182 rows and 6 columns (Year
, Candidate
, Party
, Popular Vote
, Result
, %
). Each row represents a single record — in our example, a presidential candidate from some particular year. Each column represents a single attribute or feature of the record.
We’ll now explore creating a DataFrame
with data of our own.
Consider the following examples. The first code cell creates a DataFrame
with a single column Numbers
.
df_list = pd.DataFrame([1, 2, 3], columns=["Numbers"])
df_list
+ | Numbers | +
---|---|
0 | +1 | +
1 | +2 | +
2 | +3 | +
The second creates a DataFrame
with the columns Numbers
and Description
. Notice how a 2D list of values is required to initialize the second DataFrame
— each nested list represents a single row of data.
df_list = pd.DataFrame([[1, "one"], [2, "two"]], columns = ["Number", "Description"])
df_list
+ | Number | +Description | +
---|---|---|
0 | +1 | +one | +
1 | +2 | +two | +
A third (and more common) way to create a DataFrame
is with a dictionary. The dictionary keys represent the column names, and the dictionary values represent the column values.
Below are two ways of implementing this approach. The first is based on specifying the columns of the DataFrame
, whereas the second is based on specifying the rows of the DataFrame
.
df_dict = pd.DataFrame({
    "Fruit": ["Strawberry", "Orange"],
    "Price": [5.49, 3.99]
})
df_dict
+ | Fruit | +Price | +
---|---|---|
0 | +Strawberry | +5.49 | +
1 | +Orange | +3.99 | +
df_dict = pd.DataFrame(
    [
        {"Fruit": "Strawberry", "Price": 5.49},
        {"Fruit": "Orange", "Price": 3.99}
    ]
)
df_dict
+ | Fruit | +Price | +
---|---|---|
0 | +Strawberry | +5.49 | +
1 | +Orange | +3.99 | +
Series
Earlier, we explained how a Series
was synonymous to a column in a DataFrame
. It follows, then, that a DataFrame
is equivalent to a collection of Series
, which all share the same Index
.
In fact, we can initialize a DataFrame
by merging two or more Series
. Consider the Series
s_a
and s_b
.
# Notice how our indices, or row labels, are the same
+
+= pd.Series(["a1", "a2", "a3"], index = ["r1", "r2", "r3"])
+ s_a = pd.Series(["b1", "b2", "b3"], index = ["r1", "r2", "r3"]) s_b
We can turn individual Series
into a DataFrame
using two common methods (shown below):
pd.DataFrame(s_a)
+ | 0 | +
---|---|
r1 | +a1 | +
r2 | +a2 | +
r3 | +a3 | +
s_b.to_frame()
+ | 0 | +
---|---|
r1 | +b1 | +
r2 | +b2 | +
r3 | +b3 | +
To merge the two Series
and specify their column names, we use the following syntax:
+ pd.DataFrame({"A-column": s_a,
+ "B-column": s_b
+ })
+ | A-column | +B-column | +
---|---|---|
r1 | +a1 | +b1 | +
r2 | +a2 | +b2 | +
r3 | +a3 | +b3 | +
On a more technical note, an index doesn’t have to be an integer, nor does it have to be unique. For example, we can set the index of the elections
DataFrame
to be the name of presidential candidates.
# Creating a DataFrame from a CSV file and specifying the index column
elections = pd.read_csv("data/elections.csv", index_col = "Candidate")
elections
+ | Year | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|
Candidate | ++ | + | + | + | + |
Andrew Jackson | +1824 | +Democratic-Republican | +151271 | +loss | +57.210122 | +
John Quincy Adams | +1824 | +Democratic-Republican | +113142 | +win | +42.789878 | +
Andrew Jackson | +1828 | +Democratic | +642806 | +win | +56.203927 | +
John Quincy Adams | +1828 | +National Republican | +500897 | +loss | +43.796073 | +
Andrew Jackson | +1832 | +Democratic | +702735 | +win | +54.574789 | +
... | +... | +... | +... | +... | +... | +
Jill Stein | +2016 | +Green | +1457226 | +loss | +1.073699 | +
Joseph Biden | +2020 | +Democratic | +81268924 | +win | +51.311515 | +
Donald Trump | +2020 | +Republican | +74216154 | +loss | +46.858542 | +
Jo Jorgensen | +2020 | +Libertarian | +1865724 | +loss | +1.177979 | +
Howard Hawkins | +2020 | +Green | +405035 | +loss | +0.255731 | +
182 rows × 5 columns
+We can also select a new column and set it as the index of the DataFrame
. For example, we can set the index of the elections
DataFrame
to represent the candidate’s party.
= True) # Resetting the index so we can set it again
+ elections.reset_index(inplace # This sets the index to the "Party" column
+"Party") elections.set_index(
+ | Candidate | +Year | +Popular vote | +Result | +% | +
---|---|---|---|---|---|
Party | ++ | + | + | + | + |
Democratic-Republican | +Andrew Jackson | +1824 | +151271 | +loss | +57.210122 | +
Democratic-Republican | +John Quincy Adams | +1824 | +113142 | +win | +42.789878 | +
Democratic | +Andrew Jackson | +1828 | +642806 | +win | +56.203927 | +
National Republican | +John Quincy Adams | +1828 | +500897 | +loss | +43.796073 | +
Democratic | +Andrew Jackson | +1832 | +702735 | +win | +54.574789 | +
... | +... | +... | +... | +... | +... | +
Green | +Jill Stein | +2016 | +1457226 | +loss | +1.073699 | +
Democratic | +Joseph Biden | +2020 | +81268924 | +win | +51.311515 | +
Republican | +Donald Trump | +2020 | +74216154 | +loss | +46.858542 | +
Libertarian | +Jo Jorgensen | +2020 | +1865724 | +loss | +1.177979 | +
Green | +Howard Hawkins | +2020 | +405035 | +loss | +0.255731 | +
182 rows × 5 columns
+And, if we’d like, we can revert the index back to the default list of integers.
+# This resets the index to be the default list of integer
+=True)
+ elections.reset_index(inplace elections.index
RangeIndex(start=0, stop=182, step=1)
+It is also important to note that the row labels that constitute an index don’t have to be unique. While index values can be unique and numeric, acting as a row number, they can also be named and non-unique.
+Here we see unique and numeric index values.
+However, here the index values are not unique.
+DataFrame
Attributes: Index, Columns, and ShapeOn the other hand, column names in a DataFrame
are almost always unique. Looking back to the elections
dataset, it wouldn’t make sense to have two columns named "Candidate"
. Sometimes, you’ll want to extract these different values, in particular, the list of row and column labels.
For index/row labels, use DataFrame.index
:
"Party", inplace = True)
+ elections.set_index( elections.index
Index(['Democratic-Republican', 'Democratic-Republican', 'Democratic',
+ 'National Republican', 'Democratic', 'National Republican',
+ 'Anti-Masonic', 'Whig', 'Democratic', 'Whig',
+ ...
+ 'Constitution', 'Republican', 'Independent', 'Libertarian',
+ 'Democratic', 'Green', 'Democratic', 'Republican', 'Libertarian',
+ 'Green'],
+ dtype='object', name='Party', length=182)
+For column labels, use DataFrame.columns
:
elections.columns
Index(['index', 'Candidate', 'Year', 'Popular vote', 'Result', '%'], dtype='object')
+And for the shape of the DataFrame
, we can use DataFrame.shape
to get the number of rows followed by the number of columns:
elections.shape
(182, 6)
+DataFrame
sNow that we’ve learned more about DataFrame
s, let’s dive deeper into their capabilities.
The API (Application Programming Interface) for the DataFrame
class is enormous. In this section, we’ll discuss several methods of the DataFrame
API that allow us to extract subsets of data.
The simplest way to manipulate a DataFrame
is to extract a subset of rows and columns, known as slicing.
Common ways we may want to extract data are grabbing:
+n
rows in the DataFrame
.We will do so with four primary methods of the DataFrame
class:
.head
and .tail
.loc
.iloc
[]
.head
and .tail
The simplest scenario in which we want to extract data is when we simply want to select the first or last few rows of the DataFrame
.
To extract the first n
rows of a DataFrame
df
, we use the syntax df.head(n)
.
elections = pd.read_csv("data/elections.csv")
# Extract the first 5 rows of the DataFrame
elections.head(5)
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +
4 | +1832 | +Andrew Jackson | +Democratic | +702735 | +win | +54.574789 | +
Similarly, calling df.tail(n)
allows us to extract the last n
rows of the DataFrame
.
# Extract the last 5 rows of the DataFrame
elections.tail(5)
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
177 | +2016 | +Jill Stein | +Green | +1457226 | +loss | +1.073699 | +
178 | +2020 | +Joseph Biden | +Democratic | +81268924 | +win | +51.311515 | +
179 | +2020 | +Donald Trump | +Republican | +74216154 | +loss | +46.858542 | +
180 | +2020 | +Jo Jorgensen | +Libertarian | +1865724 | +loss | +1.177979 | +
181 | +2020 | +Howard Hawkins | +Green | +405035 | +loss | +0.255731 | +
.loc
For the more complex task of extracting data with specific column or index labels, we can use .loc
. The .loc
accessor allows us to specify the labels of rows and columns we wish to extract. The labels (commonly referred to as the indices) are the bold text on the far left of a DataFrame
, while the column labels are the column names found at the top of a DataFrame
.
To grab data with .loc
, we must specify the row and column label(s) where the data exists. The row labels are the first argument to the .loc
function; the column labels are the second.
Arguments to .loc
can be:
For example, to select a single value, we can select the row labeled 0
and the column labeled Candidate
from the elections
DataFrame
.
elections.loc[0, 'Candidate']
'Andrew Jackson'
+Keep in mind that passing in just one argument as a single value will produce a Series
. Below, we’ve extracted a subset of the "Popular vote"
column as a Series
.
elections.loc[[87, 25, 179], "Popular vote"]
87 15761254
+25 848019
+179 74216154
+Name: Popular vote, dtype: int64
+To select multiple rows and columns, we can use Python slice notation. Here, we select the rows from labels 0
to 3
and the columns from labels "Year"
to "Popular vote"
. Notice that unlike Python slicing, .loc
is inclusive of the right upper bound.
elections.loc[0:3, 'Year':'Popular vote']
+ | Year | +Candidate | +Party | +Popular vote | +
---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +
Suppose that instead, we want to extract all column values for the first four rows in the elections
DataFrame
. The shorthand :
is useful for this.
elections.loc[0:3, :]
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +
We can use the same shorthand to extract all rows.
+"Year", "Candidate", "Result"]] elections.loc[:, [
+ | Year | +Candidate | +Result | +
---|---|---|---|
0 | +1824 | +Andrew Jackson | +loss | +
1 | +1824 | +John Quincy Adams | +win | +
2 | +1828 | +Andrew Jackson | +win | +
3 | +1828 | +John Quincy Adams | +loss | +
4 | +1832 | +Andrew Jackson | +win | +
... | +... | +... | +... | +
177 | +2016 | +Jill Stein | +loss | +
178 | +2020 | +Joseph Biden | +win | +
179 | +2020 | +Donald Trump | +loss | +
180 | +2020 | +Jo Jorgensen | +loss | +
181 | +2020 | +Howard Hawkins | +loss | +
182 rows × 3 columns
+There are a couple of things we should note. Firstly, unlike conventional Python, pandas
allows us to slice string values (in our example, the column labels). Secondly, slicing with .loc
is inclusive. Notice how our resulting DataFrame
includes every row and column between and including the slice labels we specified.
Equivalently, we can use a list to obtain multiple rows and columns in our elections
DataFrame
.
elections.loc[[0, 1, 2, 3], ['Year', 'Candidate', 'Party', 'Popular vote']]
+ | Year | +Candidate | +Party | +Popular vote | +
---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +
Lastly, we can interchange list and slicing notation.
elections.loc[[0, 1, 2, 3], :]
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +
.iloc
Slicing with .iloc
works similarly to .loc
. However, .iloc
uses the integer positions of rows and columns rather than the labels (think to yourself: .loc uses labels; .iloc uses integer positions). The arguments to the .iloc
function also behave similarly — single values, lists, indices, and any combination of these are permitted.
Let’s begin reproducing our results from above. We’ll begin by selecting the first presidential candidate in our elections
DataFrame
:
# elections.loc[0, "Candidate"] - Previous approach
elections.iloc[0, 1]
'Andrew Jackson'
+Notice how the first argument to both .loc
and .iloc
are the same. This is because the row with a label of 0
is conveniently in the \(0^{\text{th}}\) (equivalently, the first position) of the elections
DataFrame
. Generally, this is true of any DataFrame
where the row labels are incremented in ascending order from 0.
And, as before, if we were to pass in only one single value argument, our result would be a Series
.
elections.iloc[[1, 2, 3], 1]
1 John Quincy Adams
+2 Andrew Jackson
+3 John Quincy Adams
+Name: Candidate, dtype: object
+However, when we select the first four rows and columns using .iloc
, we notice something.
# elections.loc[0:3, 'Year':'Popular vote'] - Previous approach
elections.iloc[0:4, 0:4]
+ | Year | +Candidate | +Party | +Popular vote | +
---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +
Slicing is no longer inclusive in .iloc
— it’s exclusive. In other words, the right end of a slice is not included when using .iloc
. This is one of the subtleties of pandas
syntax; you will get used to it with practice.
List behavior works just as expected.
+#elections.loc[[0, 1, 2, 3], ['Year', 'Candidate', 'Party', 'Popular vote']] - Previous Approach
elections.iloc[[0, 1, 2, 3], [0, 1, 2, 3]]
+ | Year | +Candidate | +Party | +Popular vote | +
---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +
And just like with .loc
, we can use a colon with .iloc
to extract all rows or columns.
elections.iloc[:, 0:3]
+ | Year | +Candidate | +Party | +
---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +
2 | +1828 | +Andrew Jackson | +Democratic | +
3 | +1828 | +John Quincy Adams | +National Republican | +
4 | +1832 | +Andrew Jackson | +Democratic | +
... | +... | +... | +... | +
177 | +2016 | +Jill Stein | +Green | +
178 | +2020 | +Joseph Biden | +Democratic | +
179 | +2020 | +Donald Trump | +Republican | +
180 | +2020 | +Jo Jorgensen | +Libertarian | +
181 | +2020 | +Howard Hawkins | +Green | +
182 rows × 3 columns
+This discussion begs the question: when should we use .loc
vs. .iloc
? In most cases, .loc
is generally safer to use. You can imagine .iloc
may return incorrect values when applied to a dataset where the ordering of data can change. However, .iloc
can still be useful — for example, if you are looking at a DataFrame
of sorted movie earnings and want to get the median earnings for a given year, you can use .iloc
to index into the middle.
Overall, it is important to remember that:
+.loc
performances label-based extraction..iloc
performs integer-based extraction.[]
The []
selection operator is the most baffling of all, yet the most commonly used. It only takes a single argument, which may be one of the following:
That is, []
is context-dependent. Let’s see some examples.
Say we wanted the first four rows of our elections
DataFrame
.
elections[0:4]
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +
Suppose we now want the first four columns.
+"Year", "Candidate", "Party", "Popular vote"]] elections[[
+ | Year | +Candidate | +Party | +Popular vote | +
---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +
4 | +1832 | +Andrew Jackson | +Democratic | +702735 | +
... | +... | +... | +... | +... | +
177 | +2016 | +Jill Stein | +Green | +1457226 | +
178 | +2020 | +Joseph Biden | +Democratic | +81268924 | +
179 | +2020 | +Donald Trump | +Republican | +74216154 | +
180 | +2020 | +Jo Jorgensen | +Libertarian | +1865724 | +
181 | +2020 | +Howard Hawkins | +Green | +405035 | +
182 rows × 4 columns
+Lastly, []
allows us to extract only the "Candidate"
column.
"Candidate"] elections[
0 Andrew Jackson
+1 John Quincy Adams
+2 Andrew Jackson
+3 John Quincy Adams
+4 Andrew Jackson
+ ...
+177 Jill Stein
+178 Joseph Biden
+179 Donald Trump
+180 Jo Jorgensen
+181 Howard Hawkins
+Name: Candidate, Length: 182, dtype: object
+The output is a Series
! In this course, we’ll become very comfortable with []
, especially for selecting columns. In practice, []
is much more common than .loc
, especially since it is far more concise.
The pandas
library is enormous and contains many useful functions. Here is a link to its documentation. We certainly don’t expect you to memorize each and every method of the library, and we will give you a reference sheet for exams.
The introductory Data 100 pandas
lectures will provide a high-level view of the key data structures and methods that will form the foundation of your pandas
knowledge. A goal of this course is to help you build your familiarity with the real-world programming practice of … Googling! Answers to your questions can be found in documentation, Stack Overflow, etc. Being able to search for, read, and implement documentation is an important life skill for any data scientist.
With that, we will move on to Pandas II!
+ + +Last time, we introduced the pandas
library as a toolkit for processing data. We learned the DataFrame
and Series
data structures, familiarized ourselves with the basic syntax for manipulating tabular data, and began writing our first lines of pandas
code.
In this lecture, we’ll start to dive into some advanced pandas
syntax. You may find it helpful to follow along with a notebook of your own as we walk through these new pieces of code.
We’ll start by loading the babynames
dataset.
# This code pulls census data and loads it into a DataFrame
+# We won't cover it explicitly in this class, but you are welcome to explore it on your own
+import pandas as pd
+import numpy as np
+import urllib.request
+import os.path
+import zipfile
+
+= "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
+ data_url = "data/babynamesbystate.zip"
+ local_filename if not os.path.exists(local_filename): # If the data exists don't download again
+with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
+
+ f.write(resp.read())
+= zipfile.ZipFile(local_filename, 'r')
+ zf
+= 'STATE.CA.TXT'
+ ca_name = ['State', 'Sex', 'Year', 'Name', 'Count']
+ field_names with zf.open(ca_name) as fh:
+= pd.read_csv(fh, header=None, names=field_names)
+ babynames
+ babynames.head()
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
1 | +CA | +F | +1910 | +Helen | +239 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
Conditional selection allows us to select a subset of rows in a DataFrame
that satisfy some specified condition.
To understand how to use conditional selection, we must look at another possible input of the .loc
and []
methods – a boolean array, which is simply an array or Series
where each element is either True
or False
. This boolean array must have a length equal to the number of rows in the DataFrame
. It will return all rows that correspond to a value of True
in the array. We used a very similar technique when performing conditional extraction from a Series
in the last lecture.
To see this in action, let’s select all even-indexed rows in the first 10 rows of our DataFrame
.
# Ask yourself: why is :9 is the correct slice to select the first 10 rows?
+= babynames.loc[:9, :]
+ babynames_first_10_rows
+# Notice how we have exactly 10 elements in our boolean array argument
+True, False, True, False, True, False, True, False, True, False]] babynames_first_10_rows[[
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
6 | +CA | +F | +1910 | +Evelyn | +126 | +
8 | +CA | +F | +1910 | +Virginia | +101 | +
We can perform a similar operation using .loc
.
babynames_first_10_rows.loc[[True, False, True, False, True, False, True, False, True, False], :]
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
6 | +CA | +F | +1910 | +Evelyn | +126 | +
8 | +CA | +F | +1910 | +Virginia | +101 | +
These techniques worked well in this example, but you can imagine how tedious it might be to list out True
and False
for every row in a larger DataFrame
. To make things easier, we can instead provide a logical condition as an input to .loc
or []
that returns a boolean array with the necessary length.
For example, to return all names associated with F
sex:
# First, use a logical condition to generate a boolean array
+= (babynames["Sex"] == "F")
+ logical_operator
+# Then, use this boolean array to filter the DataFrame
+ babynames[logical_operator].head()
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
1 | +CA | +F | +1910 | +Helen | +239 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
Recall from the previous lecture that .head()
will return only the first few rows in the DataFrame
. In reality, babynames[logical operator]
contains as many rows as there are entries in the original babynames
DataFrame
with sex "F"
.
Here, logical_operator
evaluates to a Series
of boolean values with length 407428.
print("There are a total of {} values in 'logical_operator'".format(len(logical_operator)))
There are a total of 407428 values in 'logical_operator'
+Rows starting at row 0 and ending at row 239536 evaluate to True
and are thus returned in the DataFrame
. Rows from 239537 onwards evaluate to False
and are omitted from the output.
print("The 0th item in this 'logical_operator' is: {}".format(logical_operator.iloc[0]))
+print("The 239536th item in this 'logical_operator' is: {}".format(logical_operator.iloc[239536]))
+print("The 239537th item in this 'logical_operator' is: {}".format(logical_operator.iloc[239537]))
The 0th item in this 'logical_operator' is: True
+The 239536th item in this 'logical_operator' is: True
+The 239537th item in this 'logical_operator' is: False
+Passing a Series
as an argument to babynames[]
has the same effect as using a boolean array. In fact, the []
selection operator can take a boolean Series
, array, and list as arguments. These three are used interchangeably throughout the course.
We can also use .loc
to achieve similar results.
"Sex"] == "F"].head() babynames.loc[babynames[
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
1 | +CA | +F | +1910 | +Helen | +239 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
Boolean conditions can be combined using various bitwise operators, allowing us to filter results by multiple conditions. In the table below, p and q are boolean arrays or Series
.
Symbol | +Usage | +Meaning | +
---|---|---|
~ | +~p | +Returns negation of p | +
| | +p | q | +p OR q | +
& | +p & q | +p AND q | +
^ | +p ^ q | +p XOR q (exclusive or) | +
When combining multiple conditions with logical operators, we surround each individual condition with a set of parentheses (). This imposes an order of operations on how pandas evaluates your logic and helps avoid errors.
For example, if we want to return data on all names with sex "F"
born before the year 2000, we can write:
"Sex"] == "F") & (babynames["Year"] < 2000)].head() babynames[(babynames[
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
1 | +CA | +F | +1910 | +Helen | +239 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
Note that we’re working with Series
, so using and
in place of &
, or or
in place |
will error.
# This line of code will raise a ValueError
+# babynames[(babynames["Sex"] == "F") and (babynames["Year"] < 2000)].head()
If we want to return data on all names with sex "F"
or all born before the year 2000, we can write:
"Sex"] == "F") | (babynames["Year"] < 2000)].head() babynames[(babynames[
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
1 | +CA | +F | +1910 | +Helen | +239 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
Boolean array selection is a useful tool, but can lead to overly verbose code for complex conditions. In the example below, our boolean condition is long enough to extend for several lines of code.
+# Note: The parentheses surrounding the code make it possible to break the code on to multiple lines for readability
+
+ ("Name"] == "Bella") |
+ babynames[(babynames["Name"] == "Alex") |
+ (babynames["Name"] == "Ani") |
+ (babynames["Name"] == "Lisa")]
+ (babynames[ ).head()
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
6289 | +CA | +F | +1923 | +Bella | +5 | +
7512 | +CA | +F | +1925 | +Bella | +8 | +
12368 | +CA | +F | +1932 | +Lisa | +5 | +
14741 | +CA | +F | +1936 | +Lisa | +8 | +
17084 | +CA | +F | +1939 | +Lisa | +5 | +
Fortunately, pandas
provides many alternative methods for constructing boolean filters.
The .isin
function is one such example. This method evaluates if the values in a Series
are contained in a different sequence (list, array, or Series
) of values. In the cell below, we achieve equivalent results to the DataFrame
above with far more concise code.
= ["Bella", "Alex", "Narges", "Lisa"]
+ names "Name"].isin(names).head() babynames[
0 False
+1 False
+2 False
+3 False
+4 False
+Name: Name, dtype: bool
+"Name"].isin(names)].head() babynames[babynames[
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
6289 | +CA | +F | +1923 | +Bella | +5 | +
7512 | +CA | +F | +1925 | +Bella | +8 | +
12368 | +CA | +F | +1932 | +Lisa | +5 | +
14741 | +CA | +F | +1936 | +Lisa | +8 | +
17084 | +CA | +F | +1939 | +Lisa | +5 | +
The function str.startswith
can be used to define a filter based on string values in a Series
object. It checks to see if string values in a Series
start with a particular character.
# Identify whether names begin with the letter "N"
babynames["Name"].str.startswith("N").head()
0 False
+1 False
+2 False
+3 False
+4 False
+Name: Name, dtype: bool
+# Extracting names that begin with the letter "N"
babynames[babynames["Name"].str.startswith("N")].head()
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
76 | +CA | +F | +1910 | +Norma | +23 | +
83 | +CA | +F | +1910 | +Nellie | +20 | +
127 | +CA | +F | +1910 | +Nina | +11 | +
198 | +CA | +F | +1910 | +Nora | +6 | +
310 | +CA | +F | +1911 | +Nellie | +23 | +
In many data science tasks, we may need to change the columns contained in our DataFrame
in some way. Fortunately, the syntax to do so is fairly straightforward.
To add a new column to a DataFrame
, we use a syntax similar to that used when accessing an existing column. Specify the name of the new column by writing df["column"]
, then assign this to a Series
or array containing the values that will populate this column.
# Create a Series of the length of each name.
+= babynames["Name"].str.len()
+ babyname_lengths
+# Add a column named "name_lengths" that includes the length of each name
+"name_lengths"] = babyname_lengths
+ babynames[5) babynames.head(
+ | State | +Sex | +Year | +Name | +Count | +name_lengths | +
---|---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +4 | +
1 | +CA | +F | +1910 | +Helen | +239 | +5 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +7 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +8 | +
4 | +CA | +F | +1910 | +Frances | +134 | +7 | +
If we need to later modify an existing column, we can do so by referencing this column again with the syntax df["column"]
, then re-assigning it to a new Series
or array of the appropriate length.
# Modify the “name_lengths” column to be one less than its original value
babynames["name_lengths"] = babynames["name_lengths"] - 1
babynames.head()
+ | State | +Sex | +Year | +Name | +Count | +name_lengths | +
---|---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +3 | +
1 | +CA | +F | +1910 | +Helen | +239 | +4 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +6 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +7 | +
4 | +CA | +F | +1910 | +Frances | +134 | +6 | +
We can rename a column using the .rename()
method. It takes in a dictionary that maps old column names to their new ones.
# Rename “name_lengths” to “Length”
babynames = babynames.rename(columns={"name_lengths": "Length"})
babynames.head()
+ | State | +Sex | +Year | +Name | +Count | +Length | +
---|---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +3 | +
1 | +CA | +F | +1910 | +Helen | +239 | +4 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +6 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +7 | +
4 | +CA | +F | +1910 | +Frances | +134 | +6 | +
If we want to remove a column or row of a DataFrame
, we can call the .drop
(documentation) method. Use the axis
parameter to specify whether a column or row should be dropped. Unless otherwise specified, pandas
will assume that we are dropping a row by default.
# Drop our new "Length" column from the DataFrame
babynames = babynames.drop("Length", axis="columns")
babynames.head(5)
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
1 | +CA | +F | +1910 | +Helen | +239 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
Notice that we re-assigned babynames
to the result of babynames.drop(...)
. This is a subtle but important point: pandas
table operations do not occur in-place. Calling df.drop(...)
will output a copy of df
with the row/column of interest removed without modifying the original df
table.
In other words, if we simply call:
# This creates a copy of `babynames` and removes the column "Name"...
babynames.drop("Name", axis="columns")

# ...but the original `babynames` is unchanged!
# Notice that the "Name" column is still present
babynames.head(5)
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +
1 | +CA | +F | +1910 | +Helen | +239 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +
4 | +CA | +F | +1910 | +Frances | +134 | +
pandas
contains an extensive library of functions that can help shorten the process of setting and getting information from its data structures. In the following section, we will give overviews of each of the main utility functions that will help us in Data 100.
Discussing all functionality offered by pandas
could take an entire semester! We will walk you through the most commonly-used functions and encourage you to explore and experiment on your own.
NumPy
and built-in function support.shape
.size
.describe()
.sample()
.value_counts()
.unique()
.sort_values()
The pandas
documentation will be a valuable resource in Data 100 and beyond.
NumPy
pandas
is designed to work well with NumPy
, the framework for array computations you encountered in Data 8. Just about any NumPy
function can be applied to pandas
DataFrame
s and Series
.
# Pull out the number of babies named Yash each year
yash_count = babynames[babynames["Name"] == "Yash"]["Count"]
yash_count.head()
331824 8
+334114 9
+336390 11
+338773 12
+341387 10
+Name: Count, dtype: int64
+# Average number of babies named Yash each year
+ np.mean(yash_count)
np.float64(17.142857142857142)
+# Max number of babies named Yash born in any one year
np.max(yash_count)
np.int64(29)
+.shape
and .size
.shape
and .size
are attributes of Series
and DataFrame
s that measure the “amount” of data stored in the structure. Calling .shape
returns a tuple containing the number of rows and columns present in the DataFrame
or Series
. .size
is used to find the total number of elements in a structure, equivalent to the number of rows times the number of columns.
Many functions strictly require the dimensions of the arguments along certain axes to match. Calling these dimension-finding functions is much faster than counting all of the items by hand.
+# Return the shape of the DataFrame, in the format (num_rows, num_columns)
+ babynames.shape
(407428, 5)
+# Return the size of the DataFrame, equal to num_rows * num_columns
+ babynames.size
2037140
+.describe()
If many statistics are required from a DataFrame
(minimum value, maximum value, mean value, etc.), then .describe()
(documentation) can be used to compute all of them at once.
babynames.describe()
+ | Year | +Count | +
---|---|---|
count | +407428.000000 | +407428.000000 | +
mean | +1985.733609 | +79.543456 | +
std | +27.007660 | +293.698654 | +
min | +1910.000000 | +5.000000 | +
25% | +1969.000000 | +7.000000 | +
50% | +1992.000000 | +13.000000 | +
75% | +2008.000000 | +38.000000 | +
max | +2022.000000 | +8260.000000 | +
A different set of statistics will be reported if .describe()
is called on a Series
.
"Sex"].describe() babynames[
count 407428
+unique 2
+top F
+freq 239537
+Name: Sex, dtype: object
+.sample()
As we will see later in the semester, random processes are at the heart of many data science techniques (for example, train-test splits, bootstrapping, and cross-validation). .sample()
(documentation) lets us quickly select random entries (a row if called from a DataFrame
, or a value if called from a Series
).
By default, .sample()
selects entries without replacement. Pass in the argument replace=True
to sample with replacement.
# Sample a single row
+ babynames.sample()
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
28158 | +CA | +F | +1950 | +Vikki | +14 | +
Naturally, this can be chained with other methods and operators (iloc
, etc.).
# Sample 5 random rows, and select all columns after column 2
babynames.sample(5).iloc[:, 2:]
+ | Year | +Name | +Count | +
---|---|---|---|
82058 | +1979 | +Lakesha | +11 | +
387687 | +2016 | +Zayn | +101 | +
105977 | +1988 | +Cecilia | +213 | +
75257 | +1976 | +Clarice | +7 | +
7685 | +1925 | +Elia | +5 | +
# Randomly sample 4 names from the year 2000, with replacement, and select all columns after column 2
+"Year"] == 2000].sample(4, replace = True).iloc[:, 2:] babynames[babynames[
+ | Year | +Name | +Count | +
---|---|---|---|
342973 | +2000 | +Grayson | +46 | +
151608 | +2000 | +Roshni | +8 | +
343172 | +2000 | +Dwayne | +27 | +
343039 | +2000 | +Jair | +38 | +
.value_counts()
The Series.value_counts()
(documentation) method counts the number of occurrence of each unique value in a Series
. In other words, it counts the number of times each unique value appears. This is often useful for determining the most or least common entries in a Series
.
In the example below, we can determine the name with the most years in which at least one person has taken that name by counting the number of times each name appears in the "Name"
column of babynames
. Note that the return value is also a Series
.
"Name"].value_counts().head() babynames[
Name
+Jean 223
+Francis 221
+Guadalupe 218
+Jessie 217
+Marion 214
+Name: count, dtype: int64
+.unique()
If we have a Series
with many repeated values, then .unique()
(documentation) can be used to identify only the unique values. Here we return an array of all the names in babynames
.
"Name"].unique() babynames[
array(['Mary', 'Helen', 'Dorothy', ..., 'Zae', 'Zai', 'Zayvier'],
+ dtype=object)
+.sort_values()
Ordering a DataFrame
can be useful for isolating extreme values. For example, the first 5 entries of a column sorted in descending order (that is, from highest to lowest) are its 5 largest values. .sort_values
(documentation) allows us to order a DataFrame
or Series
by a specified column. We can choose to either receive the rows in ascending
order (default) or descending
order.
# Sort the "Count" column from highest to lowest
+="Count", ascending=False).head() babynames.sort_values(by
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
268041 | +CA | +M | +1957 | +Michael | +8260 | +
267017 | +CA | +M | +1956 | +Michael | +8258 | +
317387 | +CA | +M | +1990 | +Michael | +8246 | +
281850 | +CA | +M | +1969 | +Michael | +8245 | +
283146 | +CA | +M | +1970 | +Michael | +8196 | +
Unlike when calling .sort_values() on a DataFrame, we do not need to explicitly specify the column used for sorting when calling .sort_values() on a Series. We can still specify the ordering paradigm – that is, whether values are sorted in ascending or descending order.
# Sort the "Name" Series alphabetically
+"Name"].sort_values(ascending=True).head() babynames[
366001 Aadan
+384005 Aadan
+369120 Aadan
+398211 Aadarsh
+370306 Aaden
+Name: Name, dtype: object
+Manipulating DataFrames
is not a skill that is mastered in just one day. Due to the flexibility of pandas
, there are many different ways to get from point A to point B. We recommend trying multiple different ways to solve the same problem to gain even more practice and reach that point of mastery sooner.
Next, we will start digging deeper into the mechanics behind grouping data.
We will introduce the concept of aggregating data – we will familiarize ourselves with GroupBy objects and use them as tools to consolidate and summarize a DataFrame
. In this lecture, we will explore working with the different aggregation functions and dive into some advanced .groupby
methods to show just how powerful of a resource they can be for understanding our data. We will also introduce other techniques for data aggregation to provide flexibility in how we manipulate our tables.
First, let’s finish our discussion about sorting. Let’s try to solve a sorting problem using different approaches. Assume we want to find the longest baby names and sort our data accordingly.
+We’ll start by loading the babynames
dataset. Note that this dataset is filtered to only contain data from California.
# This code pulls census data and loads it into a DataFrame
# We won't cover it explicitly in this class, but you are welcome to explore it on your own
import pandas as pd
import numpy as np
import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "data/babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'STATE.CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.tail(10)
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
407418 | +CA | +M | +2022 | +Zach | +5 | +
407419 | +CA | +M | +2022 | +Zadkiel | +5 | +
407420 | +CA | +M | +2022 | +Zae | +5 | +
407421 | +CA | +M | +2022 | +Zai | +5 | +
407422 | +CA | +M | +2022 | +Zay | +5 | +
407423 | +CA | +M | +2022 | +Zayvier | +5 | +
407424 | +CA | +M | +2022 | +Zia | +5 | +
407425 | +CA | +M | +2022 | +Zora | +5 | +
407426 | +CA | +M | +2022 | +Zuriel | +5 | +
407427 | +CA | +M | +2022 | +Zylo | +5 | +
One method to do this is to first start by creating a column that contains the lengths of the names.
# Create a Series of the length of each name
babyname_lengths = babynames["Name"].str.len()

# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babyname_lengths
babynames.head(5)
+ | State | +Sex | +Year | +Name | +Count | +name_lengths | +
---|---|---|---|---|---|---|
0 | +CA | +F | +1910 | +Mary | +295 | +4 | +
1 | +CA | +F | +1910 | +Helen | +239 | +5 | +
2 | +CA | +F | +1910 | +Dorothy | +220 | +7 | +
3 | +CA | +F | +1910 | +Margaret | +163 | +8 | +
4 | +CA | +F | +1910 | +Frances | +134 | +7 | +
We can then sort the DataFrame
by that column using .sort_values()
:
# Sort by the temporary column
babynames = babynames.sort_values(by="name_lengths", ascending=False)
babynames.head(5)
+ | State | +Sex | +Year | +Name | +Count | +name_lengths | +
---|---|---|---|---|---|---|
334166 | +CA | +M | +1996 | +Franciscojavier | +8 | +15 | +
337301 | +CA | +M | +1997 | +Franciscojavier | +5 | +15 | +
339472 | +CA | +M | +1998 | +Franciscojavier | +6 | +15 | +
321792 | +CA | +M | +1991 | +Ryanchristopher | +7 | +15 | +
327358 | +CA | +M | +1993 | +Johnchristopher | +5 | +15 | +
Finally, we can drop the name_lengths
column from babynames
to prevent our table from getting cluttered.
# Drop the 'name_lengths' column
babynames = babynames.drop("name_lengths", axis='columns')
babynames.head(5)
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
334166 | +CA | +M | +1996 | +Franciscojavier | +8 | +
337301 | +CA | +M | +1997 | +Franciscojavier | +5 | +
339472 | +CA | +M | +1998 | +Franciscojavier | +6 | +
321792 | +CA | +M | +1991 | +Ryanchristopher | +7 | +
327358 | +CA | +M | +1993 | +Johnchristopher | +5 | +
key
ArgumentAnother way to approach this is to use the key
argument of .sort_values()
. Here we can specify that we want to sort "Name"
values by their length.
"Name", key=lambda x: x.str.len(), ascending=False).head() babynames.sort_values(
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
334166 | +CA | +M | +1996 | +Franciscojavier | +8 | +
327472 | +CA | +M | +1993 | +Ryanchristopher | +5 | +
337301 | +CA | +M | +1997 | +Franciscojavier | +5 | +
337477 | +CA | +M | +1997 | +Ryanchristopher | +5 | +
312543 | +CA | +M | +1987 | +Franciscojavier | +5 | +
map
FunctionWe can also use the map
function on a Series
to solve this. Say we want to sort the babynames
table by the number of "dr"
’s and "ea"
’s in each "Name"
. We’ll define the function dr_ea_count
to help us out.
# First, define a function to count the number of times "dr" or "ea" appear in each name
def dr_ea_count(string):
    return string.count('dr') + string.count('ea')

# Then, use `map` to apply `dr_ea_count` to each name in the "Name" column
babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)

# Sort the DataFrame by the new "dr_ea_count" column so we can see our handiwork
babynames = babynames.sort_values(by="dr_ea_count", ascending=False)
babynames.head()
+ | State | +Sex | +Year | +Name | +Count | +dr_ea_count | +
---|---|---|---|---|---|---|
115957 | +CA | +F | +1990 | +Deandrea | +5 | +3 | +
101976 | +CA | +F | +1986 | +Deandrea | +6 | +3 | +
131029 | +CA | +F | +1994 | +Leandrea | +5 | +3 | +
108731 | +CA | +F | +1988 | +Deandrea | +5 | +3 | +
308131 | +CA | +M | +1985 | +Deandrea | +6 | +3 | +
We can drop the dr_ea_count
once we’re done using it to maintain a neat table.
# Drop the `dr_ea_count` column
babynames = babynames.drop("dr_ea_count", axis='columns')
babynames.head(5)
+ | State | +Sex | +Year | +Name | +Count | +
---|---|---|---|---|---|
115957 | +CA | +F | +1990 | +Deandrea | +5 | +
101976 | +CA | +F | +1986 | +Deandrea | +6 | +
131029 | +CA | +F | +1994 | +Leandrea | +5 | +
108731 | +CA | +F | +1988 | +Deandrea | +5 | +
308131 | +CA | +M | +1985 | +Deandrea | +6 | +
.groupby
Up until this point, we have been working with individual rows of DataFrame
s. As data scientists, we often wish to investigate trends across a larger subset of our data. For example, we may want to compute some summary statistic (the mean, median, sum, etc.) for a group of rows in our DataFrame
. To do this, we’ll use pandas
GroupBy
objects. Our goal is to group together rows that fall under the same category and perform an operation that aggregates across all rows in the category.
Let’s say we wanted to aggregate all rows in babynames
for a given year.
"Year") babynames.groupby(
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1037c3d90>
+What does this strange output mean? Calling .groupby
(documentation) has generated a GroupBy
object. You can imagine this as a set of “mini” sub-DataFrame
s, where each subframe contains all of the rows from babynames
that correspond to a particular year.
The diagram below shows a simplified view of babynames
to help illustrate this idea.
We can’t work with a GroupBy
object directly – that is why you saw that strange output earlier rather than a standard view of a DataFrame
. To actually manipulate values within these “mini” DataFrame
s, we’ll need to call an aggregation method. This is a method that tells pandas
how to aggregate the values within the GroupBy
object. Once the aggregation is applied, pandas
will return a normal (now grouped) DataFrame
.
The first aggregation method we’ll consider is .agg
. The .agg
method takes in a function as its argument; this function is then applied to each column of a “mini” grouped DataFrame. We end up with a new DataFrame
with one aggregated row per subframe. Let’s see this in action by finding the sum
of all counts for each year in babynames
– this is equivalent to finding the number of babies born in each year.
"Year", "Count"]].groupby("Year").agg(sum).head(5) babynames[[
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/2718070104.py:1: FutureWarning:
+
+The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
+
++ | Count | +
---|---|
Year | ++ |
1910 | +9163 | +
1911 | +9983 | +
1912 | +17946 | +
1913 | +22094 | +
1914 | +26926 | +
We can relate this back to the diagram we used above. Remember that the diagram uses a simplified version of babynames
, which is why we see smaller values for the summed counts.
Calling .agg
has condensed each subframe back into a single row. This gives us our final output: a DataFrame
that is now indexed by "Year"
, with a single row for each unique year in the original babynames
DataFrame.
There are many different aggregation functions we can use, all of which are useful in different applications.
+"Year", "Count"]].groupby("Year").agg(min).head(5) babynames[[
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/86785752.py:1: FutureWarning:
+
+The provided callable <built-in function min> is currently using DataFrameGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "min" instead.
+
++ | Count | +
---|---|
Year | ++ |
1910 | +5 | +
1911 | +5 | +
1912 | +5 | +
1913 | +5 | +
1914 | +5 | +
"Year", "Count"]].groupby("Year").agg(max).head(5) babynames[[
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/3032256904.py:1: FutureWarning:
+
+The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.
+
++ | Count | +
---|---|
Year | ++ |
1910 | +295 | +
1911 | +390 | +
1912 | +534 | +
1913 | +614 | +
1914 | +773 | +
# Same result, but now we explicitly tell pandas to only consider the "Count" column when summing
+"Year")[["Count"]].agg(sum).head(5) babynames.groupby(
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/1958904241.py:2: FutureWarning:
+
+The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
+
++ | Count | +
---|---|
Year | ++ |
1910 | +9163 | +
1911 | +9983 | +
1912 | +17946 | +
1913 | +22094 | +
1914 | +26926 | +
There are many different aggregations that can be applied to the grouped data. The primary requirement is that an aggregation function must:
+Series
of data (a single column of the grouped subframe).Series
.Because of this fairly broad requirement, pandas
offers many ways of computing an aggregation.
In-built Python operations – such as sum
, max
, and min
– are automatically recognized by pandas
.
# What is the minimum count for each name in any year?
+"Name")[["Count"]].agg(min).head() babynames.groupby(
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/3244314896.py:2: FutureWarning:
+
+The provided callable <built-in function min> is currently using DataFrameGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "min" instead.
+
++ | Count | +
---|---|
Name | ++ |
Aadan | +5 | +
Aadarsh | +6 | +
Aaden | +10 | +
Aadhav | +6 | +
Aadhini | +6 | +
# What is the largest single-year count of each name?
+"Name")[["Count"]].agg(max).head() babynames.groupby(
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/3805876622.py:2: FutureWarning:
+
+The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.
+
++ | Count | +
---|---|
Name | ++ |
Aadan | +7 | +
Aadarsh | +6 | +
Aaden | +158 | +
Aadhav | +8 | +
Aadhini | +6 | +
As mentioned previously, functions from the NumPy
library, such as np.mean
, np.max
, np.min
, and np.sum
, are also fair game in pandas
.
# What is the average count for each name across all years?
+"Name")[["Count"]].agg(np.mean).head() babynames.groupby(
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/308986604.py:2: FutureWarning:
+
+The provided callable <function mean at 0x103985360> is currently using DataFrameGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
+
++ | Count | +
---|---|
Name | ++ |
Aadan | +6.000000 | +
Aadarsh | +6.000000 | +
Aaden | +46.214286 | +
Aadhav | +6.750000 | +
Aadhini | +6.000000 | +
pandas
also offers a number of in-built functions. Functions that are native to pandas
can be referenced using their string name within a call to .agg
. Some examples include:
.agg("sum")
.agg("max")
.agg("min")
.agg("mean")
.agg("first")
.agg("last")
The latter two entries in this list – "first"
and "last"
– are unique to pandas
. They return the first or last entry in a subframe column. Why might this be useful? Consider a case where every row in a group shares an identical value in some column. To represent this information in the grouped output, we can simply grab the first or last entry, which we know will be identical to all other entries.
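As a brief sketch, the string-named aggregations just listed can be passed directly to .agg; this also sidesteps the FutureWarning (visible in several outputs in this note) about passing built-in callables such as sum.

# Equivalent to .agg(sum), using the pandas-native string name instead of the built-in callable
babynames.groupby("Year")[["Count"]].agg("sum").head()

# "first" grabs the first entry of each column within each group
babynames.groupby("Name")[["Count"]].agg("first").head()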
Let’s illustrate this with an example. Say we add a new column to babynames
that contains the first letter of each name.
# Imagine we had an additional column, "First Letter". We'll explain this code next week
babynames["First Letter"] = babynames["Name"].str[0]

# We construct a simplified DataFrame containing just a subset of columns
babynames_new = babynames[["Name", "First Letter", "Year"]]
babynames_new.head()
+ | Name | +First Letter | +Year | +
---|---|---|---|
115957 | +Deandrea | +D | +1990 | +
101976 | +Deandrea | +D | +1986 | +
131029 | +Leandrea | +L | +1994 | +
108731 | +Deandrea | +D | +1988 | +
308131 | +Deandrea | +D | +1985 | +
If we form groups for each name in the dataset, "First Letter"
will be the same for all members of the group. This means that if we simply select the first entry for "First Letter"
in the group, we’ll represent all data in that group.
We can use a dictionary to apply different aggregation functions to each column during grouping.
+"Name").agg({"First Letter":"first", "Year":"max"}).head() babynames_new.groupby(
+ | First Letter | +Year | +
---|---|---|
Name | ++ | + |
Aadan | +A | +2014 | +
Aadarsh | +A | +2019 | +
Aaden | +A | +2020 | +
Aadhav | +A | +2019 | +
Aadhini | +A | +2022 | +
Let’s use .agg
to find the total number of babies born in each year. Recall that using .agg
with .groupby()
follows the format: df.groupby(column_name).agg(aggregation_function)
. The line of code below gives us the total number of babies born in each year.
"Year")[["Count"]].agg(sum).head(5)
+ babynames.groupby(# Alternative 1
+# babynames.groupby("Year")[["Count"]].sum()
+# Alternative 2
+# babynames.groupby("Year").sum(numeric_only=True)
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/390646742.py:1: FutureWarning:
+
+The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
+
++ | Count | +
---|---|
Year | ++ |
1910 | +9163 | +
1911 | +9983 | +
1912 | +17946 | +
1913 | +22094 | +
1914 | +26926 | +
Here’s an illustration of the process:
Plotting the DataFrame
we obtain tells an interesting story.
import plotly.express as px
puzzle2 = babynames.groupby("Year")[["Count"]].agg(sum)
px.line(puzzle2, y="Count")
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/4066413905.py:2: FutureWarning:
+
+The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
+
A word of warning: we made an enormous assumption when we decided to use this dataset to estimate birth rate. According to this article from the Legislative Analyst's Office, the true number of babies born in California in 2020 was 421,275. However, our plot shows 362,882 babies. What happened?
+.groupby()
FunctionA groupby
operation involves some combination of splitting a DataFrame
into grouped subframes, applying a function, and combining the results.
For some arbitrary DataFrame
df
below, the code df.groupby("year").agg(sum)
does the following:
DataFrame
into sub-DataFrame
s with rows belonging to the same year.sum
function to each column of each sub-DataFrame
.sum
into a single DataFrame
, indexed by year
..agg()
Function.agg()
can take in any function that aggregates several values into one summary value. Some commonly-used aggregation functions can even be called directly, without explicit use of .agg()
. For example, we can call .mean()
on .groupby()
:
babynames.groupby("Year").mean().head()
+We can now put this all into practice. Say we want to find the baby name with sex “F” that has fallen in popularity the most in California. To calculate this, we can first create a metric: “Ratio to Peak” (RTP). The RTP is the ratio of babies born with a given name in 2022 to the maximum number of babies born with the name in any year.
+Let’s start with calculating this for one baby, “Jennifer”.
# We filter by babies with sex "F" and sort by "Year"
f_babynames = babynames[babynames["Sex"] == "F"]
f_babynames = f_babynames.sort_values(["Year"])

# Determine how many Jennifers were born in CA per year
jenn_counts_series = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"]

# Determine the max number of Jennifers born in a year and the number born in 2022
# to calculate RTP
max_jenn = max(f_babynames[f_babynames["Name"] == "Jennifer"]["Count"])
curr_jenn = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"].iloc[-1]
rtp = curr_jenn / max_jenn
rtp
np.float64(0.018796372629843364)
+By creating a function to calculate RTP and applying it to our DataFrame
by using .groupby()
, we can easily compute the RTP for all names at once!
def ratio_to_peak(series):
    return series.iloc[-1] / max(series)

# Using .groupby() to apply the function
rtp_table = f_babynames.groupby("Name")[["Year", "Count"]].agg(ratio_to_peak)
rtp_table.head()
+ | Year | +Count | +
---|---|---|
Name | ++ | + |
Aadhini | +1.0 | +1.000000 | +
Aadhira | +1.0 | +0.500000 | +
Aadhya | +1.0 | +0.660000 | +
Aadya | +1.0 | +0.586207 | +
Aahana | +1.0 | +0.269231 | +
In the rows shown above, we can see that every row has a Year value of 1.0. This makes sense: because f_babynames is sorted by Year, the last Year entry in each group is also that group's maximum, so ratio_to_peak returns 1 for the Year column.
This is the “pandas
-ification” of logic you saw in Data 8. Much of the logic you’ve learned in Data 8 will serve you well in Data 100.
Note that you must be careful with which columns you apply the .agg()
function to. If we were to apply our function to the table as a whole by doing f_babynames.groupby("Name").agg(ratio_to_peak)
, executing our .agg()
call would result in a TypeError
.
We can avoid this issue (and prevent unintentional loss of data) by explicitly selecting the column(s) we want to apply our aggregation function to before calling .agg().
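A minimal sketch of this contrast, reusing the ratio_to_peak function and the f_babynames table defined above (the failing call is left commented out so the cell runs):

# Applying ratio_to_peak to every remaining column (including string columns
# such as "State" and "Sex") raises a TypeError:
# f_babynames.groupby("Name").agg(ratio_to_peak)

# Selecting only the relevant columns first works as intended
f_babynames.groupby("Name")[["Year", "Count"]].agg(ratio_to_peak).head()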
By default, .groupby
will not rename any aggregated columns. As we can see in the table above, the aggregated column is still named Count
even though it now represents the RTP. For better readability, we can rename Count
to Count RTP.
rtp_table = rtp_table.rename(columns={"Count": "Count RTP"})
rtp_table
+ | Year | +Count RTP | +
---|---|---|
Name | ++ | + |
Aadhini | +1.0 | +1.000000 | +
Aadhira | +1.0 | +0.500000 | +
Aadhya | +1.0 | +0.660000 | +
Aadya | +1.0 | +0.586207 | +
Aahana | +1.0 | +0.269231 | +
... | +... | +... | +
Zyanya | +1.0 | +0.466667 | +
Zyla | +1.0 | +1.000000 | +
Zylah | +1.0 | +1.000000 | +
Zyra | +1.0 | +1.000000 | +
Zyrah | +1.0 | +0.833333 | +
13782 rows × 2 columns
+By sorting rtp_table
, we can see the names whose popularity has decreased the most.
rtp_table = rtp_table.rename(columns={"Count": "Count RTP"})
rtp_table.sort_values("Count RTP").head()
+ | Year | +Count RTP | +
---|---|---|
Name | ++ | + |
Debra | +1.0 | +0.001260 | +
Debbie | +1.0 | +0.002815 | +
Carol | +1.0 | +0.003180 | +
Tammy | +1.0 | +0.003249 | +
Susan | +1.0 | +0.003305 | +
To visualize the above DataFrame
, let’s look at the line plot below:
import plotly.express as px
px.line(f_babynames[f_babynames["Name"] == "Debra"], x="Year", y="Count")
We can get the list of the top 10 names and then plot popularity with the following code:
top10 = rtp_table.sort_values("Count RTP").head(10).index

px.line(
    f_babynames[f_babynames["Name"].isin(top10)],
    x="Year",
    y="Count",
    color="Name"
)
As a quick exercise, consider what code would compute the total number of babies with each name.
+"Name")[["Count"]].agg(sum).head()
+ babynames.groupby(# alternative solution:
+# babynames.groupby("Name")[["Count"]].sum()
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/1912269730.py:1: FutureWarning:
+
+The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
+
++ | Count | +
---|---|
Name | ++ |
Aadan | +18 | +
Aadarsh | +6 | +
Aaden | +647 | +
Aadhav | +27 | +
Aadhini | +6 | +
.groupby()
, ContinuedWe’ll work with the elections
DataFrame
again.
import pandas as pd
import numpy as np

elections = pd.read_csv("data/elections.csv")
elections.head(5)
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +
4 | +1832 | +Andrew Jackson | +Democratic | +702735 | +win | +54.574789 | +
GroupBy
ObjectsThe result of groupby
applied to a DataFrame
is a DataFrameGroupBy
object, not a DataFrame
.
= elections.groupby("Year")
+ grouped_by_year type(grouped_by_year)
pandas.core.groupby.generic.DataFrameGroupBy
+There are several ways to look into DataFrameGroupBy
objects:
= elections.groupby("Party")
+ grouped_by_party grouped_by_party.groups
{'American': [22, 126], 'American Independent': [115, 119, 124], 'Anti-Masonic': [6], 'Anti-Monopoly': [38], 'Citizens': [127], 'Communist': [89], 'Constitution': [160, 164, 172], 'Constitutional Union': [24], 'Democratic': [2, 4, 8, 10, 13, 14, 17, 20, 28, 29, 34, 37, 39, 45, 47, 52, 55, 57, 64, 70, 74, 77, 81, 83, 86, 91, 94, 97, 100, 105, 108, 111, 114, 116, 118, 123, 129, 134, 137, 140, 144, 151, 158, 162, 168, 176, 178], 'Democratic-Republican': [0, 1], 'Dixiecrat': [103], 'Farmer–Labor': [78], 'Free Soil': [15, 18], 'Green': [149, 155, 156, 165, 170, 177, 181], 'Greenback': [35], 'Independent': [121, 130, 143, 161, 167, 174], 'Liberal Republican': [31], 'Libertarian': [125, 128, 132, 138, 139, 146, 153, 159, 163, 169, 175, 180], 'National Democratic': [50], 'National Republican': [3, 5], 'National Union': [27], 'Natural Law': [148], 'New Alliance': [136], 'Northern Democratic': [26], 'Populist': [48, 61, 141], 'Progressive': [68, 82, 101, 107], 'Prohibition': [41, 44, 49, 51, 54, 59, 63, 67, 73, 75, 99], 'Reform': [150, 154], 'Republican': [21, 23, 30, 32, 33, 36, 40, 43, 46, 53, 56, 60, 65, 69, 72, 79, 80, 84, 87, 90, 96, 98, 104, 106, 109, 112, 113, 117, 120, 122, 131, 133, 135, 142, 145, 152, 157, 166, 171, 173, 179], 'Socialist': [58, 62, 66, 71, 76, 85, 88, 92, 95, 102], 'Southern Democratic': [25], 'States' Rights': [110], 'Taxpayers': [147], 'Union': [93], 'Union Labor': [42], 'Whig': [7, 9, 11, 12, 16, 19]}
+"Socialist") grouped_by_party.get_group(
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
58 | +1904 | +Eugene V. Debs | +Socialist | +402810 | +loss | +2.985897 | +
62 | +1908 | +Eugene V. Debs | +Socialist | +420852 | +loss | +2.850866 | +
66 | +1912 | +Eugene V. Debs | +Socialist | +901551 | +loss | +6.004354 | +
71 | +1916 | +Allan L. Benson | +Socialist | +590524 | +loss | +3.194193 | +
76 | +1920 | +Eugene V. Debs | +Socialist | +913693 | +loss | +3.428282 | +
85 | +1928 | +Norman Thomas | +Socialist | +267478 | +loss | +0.728623 | +
88 | +1932 | +Norman Thomas | +Socialist | +884885 | +loss | +2.236211 | +
92 | +1936 | +Norman Thomas | +Socialist | +187910 | +loss | +0.412876 | +
95 | +1940 | +Norman Thomas | +Socialist | +116599 | +loss | +0.234237 | +
102 | +1948 | +Norman Thomas | +Socialist | +139569 | +loss | +0.286312 | +
GroupBy
MethodsThere are many aggregation methods we can use with .agg
. Some useful options are:
.mean
: creates a new DataFrame
with the mean value of each group.sum
: creates a new DataFrame
with the sum of each group.max
and .min
: creates a new DataFrame
with the maximum/minimum value of each group.first
and .last
: creates a new DataFrame
with the first/last row in each group.size
: creates a new Series
with the number of entries in each group.count
: creates a new DataFrame
with the number of entries, excluding missing values.Let’s illustrate some examples by creating a DataFrame
called df
.
df = pd.DataFrame({'letter': ['A', 'A', 'B', 'C', 'C', 'C'],
                   'num': [1, 2, 3, 4, np.nan, 4],
                   'state': [np.nan, 'tx', 'fl', 'hi', np.nan, 'ak']})
df
+ | letter | +num | +state | +
---|---|---|---|
0 | +A | +1.0 | +NaN | +
1 | +A | +2.0 | +tx | +
2 | +B | +3.0 | +fl | +
3 | +C | +4.0 | +hi | +
4 | +C | +NaN | +NaN | +
5 | +C | +4.0 | +ak | +
Note the slight difference between .size()
and .count()
: while .size()
returns a Series
and counts the number of entries including the missing values, .count()
returns a DataFrame
and counts the number of entries in each column excluding missing values.
"letter").size() df.groupby(
letter
+A 2
+B 1
+C 3
+dtype: int64
+"letter").count() df.groupby(
+ | num | +state | +
---|---|---|
letter | ++ | + |
A | +2 | +1 | +
B | +1 | +1 | +
C | +2 | +2 | +
You might recall that the value_counts()
function in the previous note does something similar. It turns out value_counts()
and groupby.size()
are the same, except value_counts()
sorts the resulting Series
in descending order automatically.
"letter"].value_counts() df[
letter
+C 3
+A 2
+B 1
+Name: count, dtype: int64
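To check the equivalence claimed above, a quick sketch: sorting the .size() result in descending order reproduces the value_counts() counts (up to the name attached to the resulting Series).

# Same counts as df["letter"].value_counts(), just sorted manually
df.groupby("letter").size().sort_values(ascending=False)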
+These (and other) aggregation functions are so common that pandas
allows for writing shorthand. Instead of explicitly stating the use of .agg
, we can call the function directly on the GroupBy
object.
For example, the following are equivalent:
+elections.groupby("Candidate").agg(mean)
elections.groupby("Candidate").mean()
There are many other methods that pandas
supports. You can check them out on the pandas
documentation.
Another common use for GroupBy
objects is to filter data by group.
groupby.filter
takes an argument func
, where func
is a function that:
DataFrame
object as inputTrue
or False
.groupby.filter
applies func
to each group/sub-DataFrame
:
func
returns True
for a group, then all rows belonging to the group are preserved.func
returns False
for a group, then all rows belonging to that group are filtered out.In other words, sub-DataFrame
s that correspond to True
are returned in the final result, whereas those with a False
value are not. Importantly, groupby.filter
is different from groupby.agg
in that an entire sub-DataFrame
is returned in the final DataFrame
, not just a single row. As a result, groupby.filter
preserves the original indices and the column we grouped on does NOT become the index!
To illustrate how this happens, let’s go back to the elections
dataset. Say we want to identify “tight” election years – that is, we want to find all rows that correspond to election years where all candidates in that year won a similar portion of the total vote. Specifically, let’s find all rows corresponding to a year where no candidate won more than 45% of the total vote.
In other words, we want to:
+%
in that year is less than 45%DataFrame
rows that correspond to these yearsFor each year, we need to find the maximum %
among all rows for that year. If this maximum %
is lower than 45%, we will tell pandas
to keep all rows corresponding to that year.
"Year").filter(lambda sf: sf["%"].max() < 45).head(9) elections.groupby(
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
23 | +1860 | +Abraham Lincoln | +Republican | +1855993 | +win | +39.699408 | +
24 | +1860 | +John Bell | +Constitutional Union | +590901 | +loss | +12.639283 | +
25 | +1860 | +John C. Breckinridge | +Southern Democratic | +848019 | +loss | +18.138998 | +
26 | +1860 | +Stephen A. Douglas | +Northern Democratic | +1380202 | +loss | +29.522311 | +
66 | +1912 | +Eugene V. Debs | +Socialist | +901551 | +loss | +6.004354 | +
67 | +1912 | +Eugene W. Chafin | +Prohibition | +208156 | +loss | +1.386325 | +
68 | +1912 | +Theodore Roosevelt | +Progressive | +4122721 | +loss | +27.457433 | +
69 | +1912 | +William Taft | +Republican | +3486242 | +loss | +23.218466 | +
70 | +1912 | +Woodrow Wilson | +Democratic | +6296284 | +win | +41.933422 | +
What’s going on here? In this example, we’ve defined our filtering function, func
, to be lambda sf: sf["%"].max() < 45
. This filtering function will find the maximum "%"
value among all entries in the grouped sub-DataFrame
, which we call sf
. If the maximum value is less than 45, then the filter function will return True
and all rows in that grouped sub-DataFrame
will appear in the final output DataFrame
.
Examine the DataFrame
above. Notice how, in this preview of the first 9 rows, all entries from the years 1860 and 1912 appear. This means that in 1860 and 1912, no candidate in that year won more than 45% of the total vote.
You may ask: how is the groupby.filter
procedure different to the boolean filtering we’ve seen previously? Boolean filtering considers individual rows when applying a boolean condition. For example, the code elections[elections["%"] < 45]
will check the "%"
value of every single row in elections
; if it is less than 45, then that row will be kept in the output. groupby.filter
, in contrast, applies a boolean condition across all rows in a group. If not all rows in that group satisfy the condition specified by the filter, the entire group will be discarded in the output.
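A short sketch of this contrast, assuming elections is loaded as above (the row_filtered and group_filtered names are introduced here just for illustration):

# Row-level boolean filtering: keeps any individual row with % < 45,
# even if another candidate in the same year won a majority
row_filtered = elections[elections["%"] < 45]

# Group-level filtering: keeps a year only if *every* candidate in that
# year received less than 45% of the vote
group_filtered = elections.groupby("Year").filter(lambda sf: sf["%"].max() < 45)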
lambda
FunctionsWhat if we wish to aggregate our DataFrame
using a non-standard function – for example, a function of our own design? We can do so by combining .agg
with lambda
expressions.
Let’s first consider a puzzle to jog our memory. We will attempt to find the Candidate
from each Party
with the highest %
of votes.
A naive approach may be to group by the Party
column and aggregate by the maximum.
"Party").agg(max).head(10) elections.groupby(
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/4278286395.py:1: FutureWarning:
+
+The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.
+
++ | Year | +Candidate | +Popular vote | +Result | +% | +
---|---|---|---|---|---|
Party | ++ | + | + | + | + |
American | +1976 | +Thomas J. Anderson | +873053 | +loss | +21.554001 | +
American Independent | +1976 | +Lester Maddox | +9901118 | +loss | +13.571218 | +
Anti-Masonic | +1832 | +William Wirt | +100715 | +loss | +7.821583 | +
Anti-Monopoly | +1884 | +Benjamin Butler | +134294 | +loss | +1.335838 | +
Citizens | +1980 | +Barry Commoner | +233052 | +loss | +0.270182 | +
Communist | +1932 | +William Z. Foster | +103307 | +loss | +0.261069 | +
Constitution | +2016 | +Michael Peroutka | +203091 | +loss | +0.152398 | +
Constitutional Union | +1860 | +John Bell | +590901 | +loss | +12.639283 | +
Democratic | +2020 | +Woodrow Wilson | +81268924 | +win | +61.344703 | +
Democratic-Republican | +1824 | +John Quincy Adams | +151271 | +win | +57.210122 | +
This approach is clearly wrong – the DataFrame
claims that Woodrow Wilson won the presidency in 2020.
Why is this happening? Here, the max
aggregation function is taken over every column independently. Among Democrats, max
is computing:
Year
a Democratic candidate ran for president (2020)Candidate
with the alphabetically “largest” name (“Woodrow Wilson”)Result
with the alphabetically “largest” outcome (“win”)Instead, let’s try a different approach. We will:
+DataFrame
so that rows are in descending order of %
Party
and select the first row of each sub-DataFrame
While it may seem unintuitive, sorting elections
by descending order of %
is extremely helpful. If we then group by Party
, the first row of each GroupBy
object will contain information about the Candidate
with the highest voter %
.
= elections.sort_values("%", ascending=False)
+ elections_sorted_by_percent 5) elections_sorted_by_percent.head(
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
114 | +1964 | +Lyndon Johnson | +Democratic | +43127041 | +win | +61.344703 | +
91 | +1936 | +Franklin Roosevelt | +Democratic | +27752648 | +win | +60.978107 | +
120 | +1972 | +Richard Nixon | +Republican | +47168710 | +win | +60.907806 | +
79 | +1920 | +Warren Harding | +Republican | +16144093 | +win | +60.574501 | +
133 | +1984 | +Ronald Reagan | +Republican | +54455472 | +win | +59.023326 | +
"Party").agg(lambda x : x.iloc[0]).head(10)
+ elections_sorted_by_percent.groupby(
+# Equivalent to the below code
+# elections_sorted_by_percent.groupby("Party").agg('first').head(10)
+ | Year | +Candidate | +Popular vote | +Result | +% | +
---|---|---|---|---|---|
Party | ++ | + | + | + | + |
American | +1856 | +Millard Fillmore | +873053 | +loss | +21.554001 | +
American Independent | +1968 | +George Wallace | +9901118 | +loss | +13.571218 | +
Anti-Masonic | +1832 | +William Wirt | +100715 | +loss | +7.821583 | +
Anti-Monopoly | +1884 | +Benjamin Butler | +134294 | +loss | +1.335838 | +
Citizens | +1980 | +Barry Commoner | +233052 | +loss | +0.270182 | +
Communist | +1932 | +William Z. Foster | +103307 | +loss | +0.261069 | +
Constitution | +2008 | +Chuck Baldwin | +199750 | +loss | +0.152398 | +
Constitutional Union | +1860 | +John Bell | +590901 | +loss | +12.639283 | +
Democratic | +1964 | +Lyndon Johnson | +43127041 | +win | +61.344703 | +
Democratic-Republican | +1824 | +Andrew Jackson | +151271 | +loss | +57.210122 | +
Here’s an illustration of the process:
+Notice how our code correctly determines that Lyndon Johnson from the Democratic Party has the highest voter %
.
More generally, lambda
functions are used to design custom aggregation functions that aren’t pre-defined by Python. The input parameter x
to the lambda
function is a GroupBy
object. Therefore, it should make sense why lambda x : x.iloc[0]
selects the first row in each groupby object.
In fact, there’s a few different ways to approach this problem. Each approach has different tradeoffs in terms of readability, performance, memory consumption, complexity, etc. We’ve given a few examples below.
+Note: Understanding these alternative solutions is not required. They are given to demonstrate the vast number of problem-solving approaches in pandas
.
# Using the idxmax function
best_per_party = elections.loc[elections.groupby('Party')['%'].idxmax()]
best_per_party.head(5)
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
22 | +1856 | +Millard Fillmore | +American | +873053 | +loss | +21.554001 | +
115 | +1968 | +George Wallace | +American Independent | +9901118 | +loss | +13.571218 | +
6 | +1832 | +William Wirt | +Anti-Masonic | +100715 | +loss | +7.821583 | +
38 | +1884 | +Benjamin Butler | +Anti-Monopoly | +134294 | +loss | +1.335838 | +
127 | +1980 | +Barry Commoner | +Citizens | +233052 | +loss | +0.270182 | +
# Using the .drop_duplicates function
best_per_party2 = elections.sort_values('%').drop_duplicates(['Party'], keep='last')
best_per_party2.head(5)
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
148 | +1996 | +John Hagelin | +Natural Law | +113670 | +loss | +0.118219 | +
164 | +2008 | +Chuck Baldwin | +Constitution | +199750 | +loss | +0.152398 | +
110 | +1956 | +T. Coleman Andrews | +States' Rights | +107929 | +loss | +0.174883 | +
147 | +1996 | +Howard Phillips | +Taxpayers | +184656 | +loss | +0.192045 | +
136 | +1988 | +Lenora Fulani | +New Alliance | +217221 | +loss | +0.237804 | +
We know now that .groupby
gives us the ability to group and aggregate data across our DataFrame
. The examples above formed groups using just one column in the DataFrame
. It’s possible to group by multiple columns at once by passing in a list of column names to .groupby
.
Let’s consider the babynames
dataset again. In this problem, we will find the total number of baby names associated with each sex for each year. To do this, we’ll group by both the "Year"
and "Sex"
columns.
babynames.head()
+ | State | +Sex | +Year | +Name | +Count | +First Letter | +
---|---|---|---|---|---|---|
115957 | +CA | +F | +1990 | +Deandrea | +5 | +D | +
101976 | +CA | +F | +1986 | +Deandrea | +6 | +D | +
131029 | +CA | +F | +1994 | +Leandrea | +5 | +L | +
108731 | +CA | +F | +1988 | +Deandrea | +5 | +D | +
308131 | +CA | +M | +1985 | +Deandrea | +6 | +D | +
# Find the total number of baby names associated with each sex for each
# year in the data
babynames.groupby(["Year", "Sex"])[["Count"]].agg(sum).head(6)
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/3186035650.py:3: FutureWarning:
+
+The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
+
++ | + | Count | +
---|---|---|
Year | +Sex | ++ |
1910 | +F | +5950 | +
M | +3213 | +|
1911 | +F | +6602 | +
M | +3381 | +|
1912 | +F | +9804 | +
M | +8142 | +
Notice that both "Year"
and "Sex"
serve as the index of the DataFrame
(they are both rendered in bold). We’ve created a multi-index DataFrame
where two different index values, the year and sex, are used to uniquely identify each row.
This isn’t the most intuitive way of representing this data – and, because multi-indexed DataFrames have multiple dimensions in their index, they can often be difficult to use.
+Another strategy to aggregate across two columns is to create a pivot table. You saw these back in Data 8. One set of values is used to create the index of the pivot table; another set is used to define the column names. The values contained in each cell of the table correspond to the aggregated data for each index-column pair.
+Here’s an illustration of the process:
+The best way to understand pivot tables is to see one in action. Let’s return to our original goal of summing the total number of names associated with each combination of year and sex. We’ll call the pandas
.pivot_table
method to create a new table.
# The `pivot_table` method is used to generate a Pandas pivot table
import numpy as np

babynames.pivot_table(
    index="Year",
    columns="Sex",
    values="Count",
    aggfunc=np.sum,
).head(5)
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/2548053048.py:3: FutureWarning:
+
+The provided callable <function sum at 0x103984160> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
+
+Sex | +F | +M | +
---|---|---|
Year | ++ | + |
1910 | +5950 | +3213 | +
1911 | +6602 | +3381 | +
1912 | +9804 | +8142 | +
1913 | +11860 | +10234 | +
1914 | +13815 | +13111 | +
Looks a lot better! Now, our DataFrame
is structured with clear index-column combinations. Each entry in the pivot table represents the summed count of names for a given combination of "Year"
and "Sex"
.
Let’s take a closer look at the code implemented above.
+index = "Year"
specifies the column name in the original DataFrame
that should be used as the index of the pivot tablecolumns = "Sex"
specifies the column name in the original DataFrame
that should be used to generate the columns of the pivot tablevalues = "Count"
indicates what values from the original DataFrame
should be used to populate the entry for each index-column combinationaggfunc = np.sum
tells pandas
what function to use when aggregating the data specified by values
. Here, we are summing the name counts for each pair of "Year"
and "Sex"
We can even include multiple values in the index or columns of our pivot tables.
babynames_pivot = babynames.pivot_table(
    index="Year",      # the rows (turned into index)
    columns="Sex",     # the column values
    values=["Count", "Name"],
    aggfunc=max,       # group operation
)
babynames_pivot.head(6)
/var/folders/m7/89sj44pj21ddhplt2bn4qjcm0000gr/T/ipykernel_57856/970182367.py:1: FutureWarning:
+
+The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.
+
++ | Count | +Name | +||
---|---|---|---|---|
Sex | +F | +M | +F | +M | +
Year | ++ | + | + | + |
1910 | +295 | +237 | +Yvonne | +William | +
1911 | +390 | +214 | +Zelma | +Willis | +
1912 | +534 | +501 | +Yvonne | +Woodrow | +
1913 | +584 | +614 | +Zelma | +Yoshio | +
1914 | +773 | +769 | +Zelma | +Yoshio | +
1915 | +998 | +1033 | +Zita | +Yukio | +
Note that each row provides the number of girls and number of boys having that year’s most common name, and also lists the alphabetically largest girl name and boy name. The counts for number of girls/boys in the resulting DataFrame
do not correspond to the names listed. For example, in 1910, the most popular girl name is given to 295 girls, but that name was likely not Yvonne.
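Because two values were passed, babynames_pivot carries a two-level column index; a short sketch of pulling out individual pieces of it:

# Select just the maximum-count table (one column per sex)
babynames_pivot["Count"].head(6)

# Or pull out a single (value, sex) combination as a Series
babynames_pivot[("Count", "F")].head(6)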
When working on data science projects, we’re unlikely to have absolutely all the data we want contained in a single DataFrame
– a real-world data scientist needs to grapple with data coming from multiple sources. If we have access to multiple datasets with related information, we can join two or more tables into a single DataFrame
.
To put this into practice, we’ll revisit the elections
dataset.
elections.head(5)
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +
---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +
4 | +1832 | +Andrew Jackson | +Democratic | +702735 | +win | +54.574789 | +
Say we want to understand the popularity of the names of each presidential candidate in 2022. To do this, we’ll need the combined data of babynames
and elections
.
We’ll start by creating a new column containing the first name of each presidential candidate. This will help us join each name in elections
to the corresponding name data in babynames
.
# This `str` operation splits each candidate's full name at each
# blank space, then takes just the candidate's first name
elections["First Name"] = elections["Candidate"].str.split().str[0]
elections.head(5)
+ | Year | +Candidate | +Party | +Popular vote | +Result | +% | +First Name | +
---|---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +Andrew | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +John | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +Andrew | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +John | +
4 | +1832 | +Andrew Jackson | +Democratic | +702735 | +win | +54.574789 | +Andrew | +
# Here, we'll only consider `babynames` data from 2022
babynames_2022 = babynames[babynames["Year"] == 2022]
babynames_2022.head()
+ | State | +Sex | +Year | +Name | +Count | +First Letter | +
---|---|---|---|---|---|---|
237964 | +CA | +F | +2022 | +Leandra | +10 | +L | +
404916 | +CA | +M | +2022 | +Leandro | +99 | +L | +
405892 | +CA | +M | +2022 | +Andreas | +14 | +A | +
235927 | +CA | +F | +2022 | +Andrea | +322 | +A | +
405695 | +CA | +M | +2022 | +Deandre | +18 | +D | +
Now, we’re ready to join the two tables. pd.merge
is the pandas
method used to join DataFrame
s together.
merged = pd.merge(left=elections, right=babynames_2022,
                  left_on="First Name", right_on="Name")
merged.head()
# Notice that pandas automatically specifies `Year_x` and `Year_y`
# when both merged DataFrames have the same column name to avoid confusion

# Second option
# merged = elections.merge(right=babynames_2022,
#                          left_on="First Name", right_on="Name")
+ | Year_x | +Candidate | +Party | +Popular vote | +Result | +% | +First Name | +State | +Sex | +Year_y | +Name | +Count | +First Letter | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +1824 | +Andrew Jackson | +Democratic-Republican | +151271 | +loss | +57.210122 | +Andrew | +CA | +M | +2022 | +Andrew | +741 | +A | +
1 | +1824 | +John Quincy Adams | +Democratic-Republican | +113142 | +win | +42.789878 | +John | +CA | +M | +2022 | +John | +490 | +J | +
2 | +1828 | +Andrew Jackson | +Democratic | +642806 | +win | +56.203927 | +Andrew | +CA | +M | +2022 | +Andrew | +741 | +A | +
3 | +1828 | +John Quincy Adams | +National Republican | +500897 | +loss | +43.796073 | +John | +CA | +M | +2022 | +John | +490 | +J | +
4 | +1832 | +Andrew Jackson | +Democratic | +702735 | +win | +54.574789 | +Andrew | +CA | +M | +2022 | +Andrew | +741 | +A | +
Let’s take a closer look at the parameters:
+left
and right
parameters are used to specify the DataFrame
s to be joined.left_on
and right_on
parameters are assigned to the string names of the columns to be used when performing the join. These two on
parameters tell pandas
what values should act as pairing keys to determine which rows to merge across the DataFrame
s. We’ll talk more about this idea of a pairing key next lecture.Congratulations! We finally tackled pandas
. Don’t worry if you are still not feeling very comfortable with it—you will have plenty of chances to practice over the next few weeks.
Next, we will get our hands dirty with some real-world datasets and use our pandas
knowledge to conduct some exploratory data analysis.
So far in this course, we’ve focused on supervised learning techniques that create a function to map inputs (features) to labelled outputs. Regression and classification are two main examples, where the output value of regression is quantitative while the output value of classification is categorical.
+Today, we’ll introduce an unsupervised learning technique called PCA. Unlike supervised learning, unsupervised learning is applied to unlabeled data. Because we have features but no labels, we aim to identify patterns in those features.
+Visualization can help us identify clusters or patterns in our dataset, and it can give us an intuition about our data and how to clean it for the model. For this demo, we’ll return to the MPG dataset from Lecture 19 and see how far we can push visualization for multiple features.
+import pandas as pd
+import numpy as np
+import scipy as sp
+import plotly.express as px
+import seaborn as sns
mpg = sns.load_dataset("mpg").dropna()
mpg.head()
+ | mpg | +cylinders | +displacement | +horsepower | +weight | +acceleration | +model_year | +origin | +name | +
---|---|---|---|---|---|---|---|---|---|
0 | +18.0 | +8 | +307.0 | +130.0 | +3504 | +12.0 | +70 | +usa | +chevrolet chevelle malibu | +
1 | +15.0 | +8 | +350.0 | +165.0 | +3693 | +11.5 | +70 | +usa | +buick skylark 320 | +
2 | +18.0 | +8 | +318.0 | +150.0 | +3436 | +11.0 | +70 | +usa | +plymouth satellite | +
3 | +16.0 | +8 | +304.0 | +150.0 | +3433 | +12.0 | +70 | +usa | +amc rebel sst | +
4 | +17.0 | +8 | +302.0 | +140.0 | +3449 | +10.5 | +70 | +usa | +ford torino | +
We can plot one feature as a histogram to see its distribution. Since we only plot one feature, we consider this a 1-dimensional plot.
+="displacement") px.histogram(mpg, x
We can also visualize two features (2-dimensional scatter plot):
+="displacement", y="horsepower") px.scatter(mpg, x
Three features (3-dimensional scatter plot):
+= px.scatter_3d(mpg, x="displacement", y="horsepower", z="weight",
+ fig =800, height=800)
+ width=dict(size=3)) fig.update_traces(marker
We can even push to 4 features using a 3D scatter plot and a colorbar:
+= px.scatter_3d(mpg, x="displacement",
+ fig ="horsepower",
+ y="weight",
+ z="model_year",
+ color=800, height=800,
+ width=.7)
+ opacity=dict(size=5)) fig.update_traces(marker
Visualizing 5 features is also possible if we make the scatter dots unique to the datapoint’s origin.
+= px.scatter_3d(mpg, x="displacement",
+ fig ="horsepower",
+ y="weight",
+ z="model_year",
+ color="mpg",
+ size="origin",
+ symbol=900, height=800,
+ width=.7)
+ opacity# hide color scale legend on the plotly fig
+=False) fig.update_layout(coloraxis_showscale
However, adding more features to our visualization can make our plot look messy and uninformative, and it can also be near impossible if we have a large number of features. The problem is that many datasets come with more than 5 features —— hundreds, even. Is it still possible to visualize all those features?
+Suppose we have a dataset of:
+Let’s “rename” this in terms of linear algebra so that we can be more clear with our wording. Using linear algebra, we can view our matrix as:
The intrinsic dimension of a dataset is the minimum number of dimensions needed to approximately represent the data. In linear algebra terms, it is the dimension of the column space of a matrix, or the number of linearly independent columns in a matrix; this is equivalently called the rank of a matrix.
+In the examples below, Dataset 1 has 2 dimensions because it has 2 linearly independent columns. Similarly, Dataset 2 has 3 dimensions because it has 3 linearly independent columns.
+What about Dataset 4 below?
It may be tempting to say that it has 4 dimensions, but the Weight (lbs) column is actually just a linear transformation of the Weight (kg) column. Thus, no new information is captured, and the matrix of our dataset has a (column) rank of 3! Therefore, despite having 4 columns, we still say that this data is 3-dimensional.
Plotting the weight columns together reveals the key visual intuition. While the two columns visually span a 2D space as a line, the data does not deviate at all from that singular line. This means that one of the weight columns is redundant! Even given the option to cover the whole 2D space, the data below does not. It might as well not have this dimension, which is why we still do not consider the data below to span more than 1 dimension.
+What happens when there are outliers? Below, we’ve added one outlier point to the dataset above, and just that one point is enough to change the rank of the matrix from 1 to 2 dimensions. However, the data is still approximately 1-dimensional.
+Dimensionality reduction is generally an approximation of the original data that’s achieved by projecting the data onto a desired dimension. In the example below, our original datapoints (blue dots) are 2-dimensional. We have a few choices if we want to project them down to 1-dimension: project them onto the \(x\)-axis (left), project them onto the \(y\)-axis (middle), or project them to a line \(mx + b\) (right). The resulting datapoints after the projection is shown in red. Which projection do you think is better? How can we calculate that?
+In general, we want the projection which is the best approximation for the original data (the graph on the right). In other words, we want the projection that captures the most variance of the original data. In the next section, we’ll see how this is calculated.
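To make “capturing variance” concrete, here is a minimal NumPy sketch of my own (the toy data, direction choices, and helper function are invented for illustration and are not part of the lecture demo) that compares how much of a 2D dataset’s total variance survives a projection onto the x-axis, the y-axis, and the best-fit direction:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=100)
points = np.column_stack([x, 2 * x + rng.normal(scale=0.5, size=100)])  # 2D data close to a line
points = points - points.mean(axis=0)                                   # center the data

def variance_captured(data, direction):
    # variance of the data after projecting onto a unit-length direction
    unit = direction / np.linalg.norm(direction)
    return np.var(data @ unit)

total_variance = np.var(points, axis=0).sum()
print(variance_captured(points, np.array([1.0, 0.0])) / total_variance)  # project onto the x-axis
print(variance_captured(points, np.array([0.0, 1.0])) / total_variance)  # project onto the y-axis

# The best 1D direction (found with SVD, previewed in the sections below) captures the most variance.
_, _, Vt = np.linalg.svd(points, full_matrices=False)
print(variance_captured(points, Vt[0]) / total_variance)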
+One linear technique for dimensionality reduction is matrix decomposition, which is closely tied to matrix multiplication. In this section, we will decompose our data matrix \(X\) into a lower-dimensional matrix \(Z\) that approximately recovers the original data when multiplied by \(W\).
+First, consider the matrix multiplication example below:
+Matrix decomposition (a.k.a matrix factorization) is the opposite of matrix multiplication. Instead of multiplying two matrices, we want to decompose a single matrix into 2 separate matrices. Just like with real numbers, there are infinite ways to decompose a matrix into a product of two matrices. For example, \(9.9\) can be decomposed as \(1.1 * 9\), \(3.3 * 3.3\), \(1 * 9.9\), etc. Additionally, the sizes of the 2 decomposed matrices can vary drastically. In the example below, the first factorization (top) multiplies a \(3x2\) matrix by a \(2x3\) matrix while the second factorization (bottom) multiplies a \(3x3\) matrix by a \(3x3\) matrix; both result in the original matrix on the right.
+We can even expand the \(3x3\) matrices to \(3x4\) and \(4x3\) (shown below as the factorization on top), but this defeats the point of dimensionality reduction since we’re adding more “useless” dimensions. On the flip side, we also can’t reduce the dimension to \(3x1\) and \(1x3\) (shown below as the factorization on the bottom); since the rank of the original matrix is greater than 1, this decomposition will not result in the original matrix.
+
In practice, we often work with datasets containing many features, so we usually want to construct decompositions where the dimensionality is below the rank of the original matrix. While this does not recover the data exactly, we can still provide approximate reconstructions of the matrix.
In the next section, we will discuss a method to automatically and approximately factorize data. This avoids redundant features and makes computation easier because we can train on less data. Since some approximations are better than others, we will also discuss how the method helps us capture a lot of information in a low number of dimensions.
+In PCA, our goal is to transform observations from high-dimensional data down to low dimensions (often 2, as most visualizations are 2D) through linear transformations. In other words, we want to find a linear transformation that creates a low-dimension representation that captures as much of the original data’s total variance as possible.
+We often perform PCA during the Exploratory Data Analysis (EDA) stage of our data science lifecycle when we don’t know what model to use. It helps us with:
+There are two equivalent ways of framing PCA:
To execute the first approach of variance maximization framing (more common), we can find the variances of each attribute with np.var and then keep the \(k\) attributes with the highest variance. However, this approach limits us to work with attributes individually; it cannot resolve collinearity, and we cannot combine features.
The second approach uses PCA to construct principal components with the most variance in the data (even higher than the first approach) using linear combinations of features. We’ll describe the procedure in the next section.
To perform PCA on a matrix:

1. Center the data matrix by subtracting the mean of each attribute column.
2. To find the \(i\)-th principal component \(v_i\): \(v\) is a unit vector that linearly combines the attributes; \(v\) gives a one-dimensional projection of the data; \(v\) is chosen to maximize the variance along the projection onto \(v\); and \(v\) must be orthogonal to all previous principal components.

The \(k\) principal components capture the most variance of any \(k\)-dimensional reduction of the data matrix.
+In practice, however, we don’t carry out the procedures in step 2 because they take too long to compute. Instead, we use singular value decomposition (SVD) to find all principal components efficiently.
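To preview why the SVD shortcut is valid, the following brief sketch (a toy example of my own, not part of the course demo) checks that the eigenvectors of the covariance matrix and the rows of \(V^T\) returned by SVD agree up to sign:

import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 3))            # toy data: 100 rows, 3 features
X = X - X.mean(axis=0)                   # step 1: center each column

# Procedure from step 2: eigenvectors of the covariance matrix, sorted by eigenvalue
cov = X.T @ X / len(X)
eigenvalues, eigenvectors = np.linalg.eigh(cov)
order = np.argsort(eigenvalues)[::-1]
pcs_from_cov = eigenvectors[:, order]

# What we use in practice: SVD of the centered data
U, S, Vt = np.linalg.svd(X, full_matrices=False)

print(np.allclose(np.abs(pcs_from_cov), np.abs(Vt.T)))  # True: same directions up to sign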
In this section, we will derive PCA keeping the following goal in mind: minimize the reconstruction loss for our matrix factorization model. You are not expected to be able to redo this derivation, but understanding it may help with future assignments.
+Given a matrix \(X\) with \(n\) rows and \(d\) columns, our goal is to find its best decomposition such that \[X \approx Z W\] Z has \(n\) rows and \(k\) columns; W has \(k\) rows and \(d\) columns.
+To measure the accuracy of our reconstruction, we define the reconstruction loss below, where \(X_i\) is the row vector of \(X\), and \(Z_i\) is the row vector of \(Z\):
+There are many solutions to the above, so let’s constrain our model such that \(W\) is a row-orthonormal matrix (i.e. \(WW^T=I\)) where the rows of \(W\) are our principal components.
+In our derivation, let’s first work with the case where \(k=1\). Here Z will be an \(n \times 1\) vector and W will be a \(1 \times d\) vector.
\[\begin{aligned}
L(z,w) &= \frac{1}{n}\sum_{i=1}^{n}(X_i - z_{i}w)(X_i - z_{i}w)^T \\
&= \frac{1}{n}\sum_{i=1}^{n}(X_{i}X_{i}^T - 2z_{i}X_{i}w^T + z_{i}^{2}ww^T) & \text{(expand the loss)} \\
&= \frac{1}{n}\sum_{i=1}^{n}(-2z_{i}X_{i}w^T + z_{i}^{2}) & \text{(first term is constant and }ww^T=1\text{ by orthonormality)} \\
\end{aligned}\]
+Now, we can take the derivative with respect to \(Z_i\). \[\begin{aligned} +\frac{\partial{L(Z,W)}}{\partial{z_i}} &= \frac{1}{n}(-2X_{i}w^T + 2z_{i}) \\ +z_i &= X_iw^T & \text{(Setting derivative equal to 0 and solving for }z_i\text{)}\end{aligned}\]
+We can now substitute our solution for \(z_i\) in our loss function:
+\[\begin{aligned} +L(z,w) &= \frac{1}{n}\sum_{i=1}^{n}(-2z_{i}X_{i}w^T + z_{i}^{2}) \\ +L(z=X_iw^T,w) &= \frac{1}{n}\sum_{i=1}^{n}(-2X_iw^TX_{i}w^T + (X_iw^T)^{2}) \\ +&= \frac{1}{n}\sum_{i=1}^{n}(-X_iw^TX_{i}w^T) \\ +&= \frac{1}{n}\sum_{i=1}^{n}(-wX_{i}^TX_{i}w^T) \\ +&= -w\frac{1}{n}\sum_{i=1}^{n}(X_i^TX_{i})w^T \\ +&= -w\Sigma w^T +\end{aligned}\]
Now, we need to minimize our loss with respect to \(w\). Since we have a negative sign, one way we could do this is by making \(w\) really big. However, we also have the orthonormality constraint \(ww^T=1\). To incorporate this constraint into the equation, we can add a Lagrange multiplier, \(\lambda\). Note that Lagrange multipliers are out of scope for Data 100.
+\[ +L(w,\lambda) = -w\Sigma w^T + \lambda(ww^T-1) +\]
Taking the derivative with respect to \(w\), \[\begin{aligned}
\frac{\partial{L(w,\lambda)}}{\partial{w}} &= -2\Sigma w^T + 2\lambda w^T \\
2\Sigma w^T - 2\lambda w^T &= 0 & \text{(setting the derivative equal to 0)} \\
\Sigma w^T &= \lambda w^T \\
\end{aligned}\]
This result implies that \(w\) is a unit eigenvector of the covariance matrix \(\Sigma\) and that \(\lambda\) is its eigenvalue. Since the loss equals \(-w\Sigma w^T = -\lambda\), the loss is minimized by choosing the eigenvector with the largest eigenvalue \(\lambda\).
+This derivation can inductively be used for the next (second) principal component (not shown).
+The final takeaway from this derivation is that the principal components are the eigenvectors with the largest eigenvalues of the covariance matrix. These are the directions of the maximum variance of the data. We can construct the latent factors (the Z matrix) by projecting the centered data X onto the principal component vectors:
+We often work with high-dimensional data that contain many columns/features. Given all these dimensions, this data can be difficult to visualize and model. However, not all the data in this high-dimensional space is useful —— there could be repeated features or outliers that make the data seem more complex than it really is. The most concise representation of high-dimensional data is its intrinsic dimension. Our goal with this lecture is to use dimensionality reduction to find the intrinsic dimension of a high-dimensional dataset. In other words, we want to find a smaller set of new features/columns that approximates the original data well without losing that much information. This is especially useful because this smaller set of features allows us to better visualize the data and do EDA to understand which modeling techniques would fit the data well.
+In order to find the intrinsic dimension of a high-dimensional dataset, we’ll use techniques from linear algebra. Suppose we have a high-dimensional dataset, \(X\), that has \(n\) rows and \(d\) columns. We want to factor (split) \(X\) into two matrices, \(Z\) and \(W\). \(Z\) has \(n\) rows and \(k\) columns; \(W\) has \(k\) rows and \(d\) columns.
+\[ X \approx ZW\]
+We can reframe this problem as a loss function: in other words, if we want \(X\) to roughly equal \(ZW\), their difference should be as small as possible, ideally 0. This difference becomes our loss function, \(L(Z, W)\):
+\[L(Z, W) = \frac{1}{n}\sum_{i=1}^{n}||X_i - Z_iW||^2\]
+Breaking down the variables in this formula:
Using calculus and optimization techniques (take EECS 127 if you’re interested!), we find that this loss is minimized when \[Z = XW^T\] The proof for this is out of scope for Data 100, but for those who are interested, the argument mirrors the in-depth derivation above: substitute \(Z = XW^T\) back into the loss, add the orthonormality constraint with a Lagrange multiplier, and set the derivative with respect to \(W\) equal to 0.

This gives us a very cool result of
+\[\Sigma w^T = \lambda w^T\]
\(\Sigma\) is the covariance matrix of \(X\). The equation above implies that the optimal \(w\) is a unit eigenvector of \(\Sigma\), and that the loss is minimized when we pick the eigenvector with the largest eigenvalue \(\lambda\).
+This tells us that the principal components (rows of \(W\)) are the eigenvectors with the largest eigenvalues of the covariance matrix \(\Sigma\). They represent the directions of maximum variance in the data. We can construct the latent factors, or the \(Z\) matrix, by projecting the centered data \(X\) onto the principal component vectors, \(W^T\).
+But how do we compute the eigenvectors of \(\Sigma\)? Let’s dive into SVD to answer this question.
+Singular value decomposition (SVD) is an important concept in linear algebra. Since this class requires a linear algebra course (MATH 54, MATH 56, or EECS 16A) as a pre/co-requisite, we assume you have taken or are taking a linear algebra course, so we won’t explain SVD in its entirety. In particular, we will go over:
+We will not dive deep into the theory and details of SVD. Instead, we will only cover what is needed for a data science interpretation. If you’d like more information, check out EECS 16B Note 14 or EECS 16B Note 15.
+ + +Singular value decomposition (SVD) describes a matrix \(X\)’s decomposition into three matrices: \[ X = U S V^T \]
+Let’s break down each of these terms one by one.
+NumPy
For this demo, we’ll work with a rectangular dataset containing \(n=100\) rows and \(d=4\) columns.
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+
np.random.seed(23)  # kallisti

plt.rcParams["figure.figsize"] = (4, 4)
plt.rcParams["figure.dpi"] = 150
sns.set()

rectangle = pd.read_csv("data/rectangle_data.csv")
rectangle.head(5)
+ | width | +height | +area | +perimeter | +
---|---|---|---|---|
0 | +8 | +6 | +48 | +28 | +
1 | +2 | +4 | +8 | +12 | +
2 | +1 | +3 | +3 | +8 | +
3 | +9 | +3 | +27 | +24 | +
4 | +9 | +8 | +72 | +34 | +
In NumPy, the SVD decomposition function can be called with np.linalg.svd (documentation). There are multiple versions of SVD; to get the version that we will follow, we need to set the full_matrices parameter to False.
U, S, Vt = np.linalg.svd(rectangle, full_matrices=False)
First, let’s examine U. As we can see, its dimensions are \(n \times d\).
U.shape
(100, 4)
The first 5 rows of U are shown below.

pd.DataFrame(U).head(5)
+ | 0 | +1 | +2 | +3 | +
---|---|---|---|---|
0 | +-0.155151 | +0.064830 | +-0.029935 | +0.934418 | +
1 | +-0.038370 | +-0.089155 | +0.062019 | +-0.299462 | +
2 | +-0.020357 | +-0.081138 | +0.058997 | +0.006852 | +
3 | +-0.101519 | +-0.076203 | +-0.148160 | +-0.011848 | +
4 | +-0.218973 | +0.206423 | +0.007274 | +-0.056580 | +
\(S\) is a little different in NumPy. Since the only useful values in the diagonal matrix \(S\) are the singular values on the diagonal axis, only those values are returned and they are stored in an array.

Our rectangle_data has a rank of \(3\), so we should have 3 non-zero singular values, sorted from largest to smallest.
S
array([3.62932568e+02, 6.29904732e+01, 2.56544651e+01, 2.56364534e-14])
It seems like we have 4 non-zero values instead of 3, but notice that the last value is so small (on the order of \(10^{-14}\)) that it’s practically \(0\). Hence, we can round the values to get 3 singular values.
np.round(S)
array([363., 63., 26., 0.])
To get S in matrix format, we use np.diag.

Sm = np.diag(S)
Sm
array([[3.62932568e+02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+ [0.00000000e+00, 6.29904732e+01, 0.00000000e+00, 0.00000000e+00],
+ [0.00000000e+00, 0.00000000e+00, 2.56544651e+01, 0.00000000e+00],
+ [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.56364534e-14]])
Finally, we can see that Vt is indeed a \(d \times d\) matrix.
Vt.shape
(4, 4)
+ pd.DataFrame(Vt)
+ | 0 | +1 | +2 | +3 | +
---|---|---|---|---|
0 | +-0.146436 | +-0.129942 | +-8.100201e-01 | +-0.552756 | +
1 | +-0.192736 | +-0.189128 | +5.863482e-01 | +-0.763727 | +
2 | +-0.704957 | +0.709155 | +7.951614e-03 | +0.008396 | +
3 | +-0.666667 | +-0.666667 | +9.775109e-17 | +0.333333 | +
To check that this SVD is a valid decomposition, we can reverse it and see if it matches our original table (it does, yay!).
pd.DataFrame(U @ Sm @ Vt).head(5)
+ | 0 | +1 | +2 | +3 | +
---|---|---|---|---|
0 | +8.0 | +6.0 | +48.0 | +28.0 | +
1 | +2.0 | +4.0 | +8.0 | +12.0 | +
2 | +1.0 | +3.0 | +3.0 | +8.0 | +
3 | +9.0 | +3.0 | +27.0 | +24.0 | +
4 | +9.0 | +8.0 | +72.0 | +34.0 | +
Principal Component Analysis (PCA) and Singular Value Decomposition (SVD) can be easily mixed up, especially when you have to keep track of so many acronyms. Here is a quick summary:
+After centering the original data matrix \(X\) so that each column has a mean of 0, we find its SVD: \[ X = U S V^T \]
Because \(X\) is centered, the covariance matrix of \(X\), \(\Sigma\), is equal to \(X^T X\) up to the constant \(\frac{1}{n}\) factor, which we omit here since it does not change the eigenvectors. Rearranging this equation, we get
+\[ +\begin{align} +\Sigma &= X^T X \\ +&= (U S V^T)^T U S V^T \\ +&= V S^T U^T U S V^T & \text{U is orthonormal, so $U^T U = I$} \\ +&= V S^2 V^T +\end{align} +\]
+Multiplying both sides by \(V\), we get
+\[ +\begin{align} +\Sigma V &= VS^2 V^T V \\ +&= V S^2 +\end{align} +\]
+This shows that the columns of \(V\) are the eigenvectors of the covariance matrix \(\Sigma\) and, therefore, the principal components. Additionally, the squared singular values \(S^2\) are the eigenvalues of \(\Sigma\).
+ +We’ve now shown that the first \(k\) columns of \(V\) (equivalently, the first \(k\) rows of \(V^{T}\)) are the first k principal components of \(X\). We can use them to construct the latent vector representation of \(X\), \(Z\), by projecting \(X\) onto the principal components.
+ +We can then instead compute \(Z\) as follows:
+\[ +\begin{align} +Z &= X V \\ +&= USV^T V \\ +&= U S +\end{align} +\]
+\[Z = XV = US\]
In other words, we can construct \(X\)’s latent vector representation \(Z\) either by projecting \(X\) onto the first \(k\) principal components (\(XV\)) or by multiplying the first \(k\) columns of \(U\) by the corresponding singular values (\(US\)).
+Using \(Z\), we can approximately recover the centered \(X\) matrix by multiplying \(Z\) by \(V^T\): \[ Z V^T = XV V^T = USV^T = X\]
+Note that to recover the original (uncentered) \(X\) matrix, we would also need to add back the mean.
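As a concrete illustration, here is a minimal sketch of my own that reuses the rectangle data and the NumPy/pandas imports from the cells above (the variable names with the _r suffix are mine); it keeps the first \(k\) components, builds \(Z\), projects back, and adds the column means:

k = 2
centered = rectangle - rectangle.mean(axis=0)            # center each column
U_r, S_r, Vt_r = np.linalg.svd(centered, full_matrices=False)

Z = U_r[:, :k] * S_r[:k]                                 # latent representation (equivalently centered @ Vt_r[:k].T)
X_approx = Z @ Vt_r[:k] + rectangle.mean(axis=0).values  # project back, then un-center

pd.DataFrame(X_approx, columns=rectangle.columns).head()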
+ +As we discussed above, when conducting PCA, we first center the data matrix \(X\) and then rotate it such that the direction with the most variation (e.g., the direction that is most spread out) aligns with the x-axis.
+In particular, the elements of each column of \(V\) (row of \(V^{T}\)) rotate the original feature vectors, projecting \(X\) onto the principal components.
+The first column of \(V\) indicates how each feature contributes (e.g. positive, negative, etc.) to principal component 1; it essentially assigns “weights” to each feature.
+Coupled together, this interpretation also allows us to understand that:
+Let’s summarize the steps to obtain Principal Components via SVD:
+Center the data matrix \(X\) by subtracting the mean of each attribute column.
To find the \(k\) principal components:
Let’s now walk through an example where we compute PCA using SVD. In order to get the first \(k\) principal components from an \(n \times d\) matrix \(X\), we first center \(X\) by subtracting the mean of each attribute column; in the code below, we pass axis=0 so that the mean is computed per column. We then take the SVD of the centered data and keep the first \(k\) columns of \(V\).

centered_df = rectangle - np.mean(rectangle, axis=0)
centered_df.head(5)
+ | width | +height | +area | +perimeter | +
---|---|---|---|---|
0 | +2.97 | +1.35 | +24.78 | +8.64 | +
1 | +-3.03 | +-0.65 | +-15.22 | +-7.36 | +
2 | +-4.03 | +-1.65 | +-20.22 | +-11.36 | +
3 | +3.97 | +-1.65 | +3.78 | +4.64 | +
4 | +3.97 | +3.35 | +48.78 | +14.64 | +
U, S, Vt = np.linalg.svd(centered_df, full_matrices=False)
Sm = pd.DataFrame(np.diag(np.round(S, 1)))

two_PCs = Vt.T[:, :2]
pd.DataFrame(two_PCs).head()
+ | 0 | +1 | +
---|---|---|
0 | +-0.098631 | +0.668460 | +
1 | +-0.072956 | +-0.374186 | +
2 | +-0.931226 | +-0.258375 | +
3 | +-0.343173 | +0.588548 | +
We define the total variance of a data matrix as the sum of variances of attributes. The principal components are a low-dimension representation that capture as much of the original data’s total variance as possible. Formally, the \(i\)-th singular value tells us the component score, or how much of the data variance is captured by the \(i\)-th principal component. Assuming the number of datapoints is \(n\):
\[\text{i-th component score} = \frac{(\text{i-th singular value})^2}{n}\]
+Summing up the component scores is equivalent to computing the total variance if we center our data.
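As a quick sanity check (a minimal sketch of my own, reusing centered_df and S from the cells above), the component scores computed from the singular values should sum to the total variance of the centered data:

component_scores = S**2 / len(centered_df)      # one score per principal component
total_variance = centered_df.var(ddof=0).sum()  # population variance of each column, summed
print(component_scores.sum(), total_variance)   # the two sums match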
+Data Centering: PCA has a data-centering step that precedes any singular value decomposition, where, if implemented, the component score is defined as above.
+If you want to dive deeper into PCA, Steve Brunton’s SVD Video Series is a great resource.
+We often plot the first two principal components using a scatter plot, with PC1 on the \(x\)-axis and PC2 on the \(y\)-axis. This is often called a PCA plot.
+If the first two singular values are large and all others are small, then two dimensions are enough to describe most of what distinguishes one observation from another. If not, a PCA plot omits a lot of information.
+PCA plots help us assess similarities between our data points and if there are any clusters in our dataset. In the case study before, for example, we could create the following PCA plot:
+A scree plot shows the variance ratio captured by each principal component, with the largest variance ratio first. They help us visually determine the number of dimensions needed to describe the data reasonably. The singular values that fall in the region of the plot after a large drop-off correspond to principal components that are not needed to describe the data since they explain a relatively low proportion of the total variance of the data. This point where adding more principal components results in diminishing returns is called the “elbow” and is the point just before the line flattens out. Using this “elbow method”, we can see that the elbow is at the second principal component.
+Biplots superimpose the directions onto the plot of PC1 vs. PC2, where vector \(j\) corresponds to the direction for feature \(j\) (e.g., \(v_{1j}, v_{2j}\)). There are several ways to scale biplot vectors —— in this course, we plot the direction itself. For other scalings, which can lead to more interpretable directions/loadings, see SAS biplots.
+Through biplots, we can interpret how features correlate with the principal components shown: positively, negatively, or not much at all.
+The directions of the arrow are (\(v_1\), \(v_2\)), where \(v_1\) and \(v_2\) are how that specific feature column contributes to PC1 and PC2, respectively. \(v_1\) and \(v_2\) are elements of the first and second columns of \(V\), respectively (i.e., the first two rows of \(V^T\)).
+Say we were considering feature 3, and say that was the purple arrow labeled “520” here (pointing bottom right).
+Let’s examine how the House of Representatives (of the 116th Congress, 1st session) voted in the month of September 2019.
+Specifically, we’ll look at the records of Roll call votes. From the U.S. Senate (link): roll call votes occur when a representative or senator votes “yea” or “nay” so that the names of members voting on each side are recorded. A voice vote is a vote in which those in favor or against a measure say “yea” or “nay,” respectively, without the names or tallies of members voting on each side being recorded.
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import yaml
+from datetime import datetime
+import plotly.express as px
+import plotly.graph_objects as go
+
+
+= pd.read_csv("data/votes.csv")
+ votes = votes.astype({"roll call": str})
+ votes votes.head()
+ | chamber | +session | +roll call | +member | +vote | +
---|---|---|---|---|---|
0 | +House | +1 | +555 | +A000374 | +Not Voting | +
1 | +House | +1 | +555 | +A000370 | +Yes | +
2 | +House | +1 | +555 | +A000055 | +No | +
3 | +House | +1 | +555 | +A000371 | +Yes | +
4 | +House | +1 | +555 | +A000372 | +No | +
Suppose we pivot this table to group each legislator and their voting pattern across every (roll call) vote in this month. We mark 1 if the legislator voted Yes (“yea”), and 0 otherwise (“No”, “nay”, no vote, speaker, etc.).
def was_yes(s):
    return 1 if s.iloc[0] == "Yes" else 0


vote_pivot = votes.pivot_table(
    index="member", columns="roll call", values="vote", aggfunc=was_yes, fill_value=0
)
print(vote_pivot.shape)
vote_pivot.head()
(441, 41)
+roll call | +515 | +516 | +517 | +518 | +519 | +520 | +521 | +522 | +523 | +524 | +... | +546 | +547 | +548 | +549 | +550 | +551 | +552 | +553 | +554 | +555 | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
member | ++ | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + |
A000055 | +1 | +0 | +0 | +0 | +1 | +1 | +0 | +1 | +1 | +1 | +... | +0 | +0 | +1 | +0 | +0 | +1 | +0 | +0 | +1 | +0 | +
A000367 | +0 | +0 | +0 | +0 | +0 | +0 | +0 | +0 | +0 | +0 | +... | +0 | +1 | +1 | +1 | +1 | +0 | +1 | +1 | +0 | +1 | +
A000369 | +1 | +1 | +0 | +0 | +1 | +1 | +0 | +1 | +1 | +1 | +... | +0 | +0 | +1 | +0 | +0 | +1 | +0 | +0 | +1 | +0 | +
A000370 | +1 | +1 | +1 | +1 | +1 | +0 | +1 | +0 | +0 | +0 | +... | +1 | +1 | +1 | +1 | +1 | +0 | +1 | +1 | +1 | +1 | +
A000371 | +1 | +1 | +1 | +1 | +1 | +0 | +1 | +0 | +0 | +0 | +... | +1 | +1 | +1 | +1 | +1 | +0 | +1 | +1 | +1 | +1 | +
5 rows × 41 columns
+Do legislators’ roll call votes show a relationship with their political party?
+While we could consider loading information about the legislator, such as their party, and see how this relates to their voting pattern, it turns out that we can do a lot with PCA to cluster legislators by how they vote. Let’s calculate the principal components using the SVD method.
vote_pivot_centered = vote_pivot - np.mean(vote_pivot, axis=0)
u, s, vt = np.linalg.svd(vote_pivot_centered, full_matrices=False)  # SVD
We can use the singular values in s
to construct a scree plot:
fig = px.line(y=s**2 / sum(s**2), title='Variance Explained', width=700, height=600, markers=True)
fig.update_xaxes(title_text='Principal Component i')
fig.update_yaxes(title_text='Proportion of Variance Explained')
It looks like this graph plateaus after the third principal component, so our “elbow” is at PC3, and most of the variance is captured by just the first three principal components. Let’s use these PCs to visualize the latent vector representation of \(X\)!
# Calculate the latent vector representation (US or XV)
# using the first 3 principal components
vote_2d = pd.DataFrame(index=vote_pivot_centered.index)
vote_2d[["z1", "z2", "z3"]] = (u * s)[:, :3]

# Plot the latent vector representation
fig = px.scatter_3d(vote_2d, x='z1', y='z2', z='z3', title='Vote Data', width=800, height=600)
fig.update_traces(marker=dict(size=5))
Based on the plot above, it looks like there are two clusters of datapoints. What do you think this corresponds to?
+By incorporating member information (source), we can augment our graph with biographic data like each member’s party and gender.
legislators_data = yaml.safe_load(open("data/legislators-2019.yaml"))


def to_date(s):
    return datetime.strptime(s, "%Y-%m-%d")


legs = pd.DataFrame(
    columns=[
        "leg_id",
        "first",
        "last",
        "gender",
        "state",
        "chamber",
        "party",
        "birthday",
    ],
    data=[
        [
            x["id"]["bioguide"],
            x["name"]["first"],
            x["name"]["last"],
            x["bio"]["gender"],
            x["terms"][-1]["state"],
            x["terms"][-1]["type"],
            x["terms"][-1]["party"],
            to_date(x["bio"]["birthday"]),
        ]
        for x in legislators_data
    ],
)
legs["age"] = 2024 - legs["birthday"].dt.year
legs.set_index("leg_id")
legs.sort_index()

vote_2d = vote_2d.join(legs.set_index("leg_id")).dropna()

np.random.seed(42)
vote_2d["z1_jittered"] = vote_2d["z1"] + np.random.normal(0, 0.1, len(vote_2d))
vote_2d["z2_jittered"] = vote_2d["z2"] + np.random.normal(0, 0.1, len(vote_2d))
vote_2d["z3_jittered"] = vote_2d["z3"] + np.random.normal(0, 0.1, len(vote_2d))

px.scatter_3d(vote_2d, x='z1_jittered', y='z2_jittered', z='z3_jittered', color='party', symbol="gender", size='age',
              title='Vote Data', width=800, height=600, size_max=10,
              opacity=0.7,
              color_discrete_map={'Democrat': 'blue', 'Republican': 'red', "Independent": "green"},
              hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])
Using SVD and PCA, we can clearly see a separation between the red dots (Republican) and blue dots (Democrat).
+ +We can also look at \(V^T\) directly to try to gain insight into why each component is as it is.
fig_eig = px.bar(x=vote_pivot_centered.columns, y=vt[0, :])
# extract the trace from the figure
fig_eig.show()
We have the party affiliation labels so we can see if this eigenvector aligns with one of the parties.
party_line_votes = (
    vote_pivot_centered.join(legs.set_index("leg_id")["party"])
    .groupby("party")
    .mean()
    .T.reset_index()
    .rename(columns={"index": "call"})
    .melt("call")
)
fig = px.bar(
    party_line_votes,
    x="call", y="value", facet_row="party", color="party",
    color_discrete_map={'Democrat': 'blue', 'Republican': 'red', "Independent": "green"})
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

loadings = pd.DataFrame(
    {"pc1": np.sqrt(s[0]) * vt[0, :], "pc2": np.sqrt(s[1]) * vt[1, :]},
    index=vote_pivot_centered.columns,
)

vote_2d["num votes"] = votes[votes["vote"].isin(["Yes", "No"])].groupby("member").size()
vote_2d.dropna(inplace=True)

fig = px.scatter(
    vote_2d,
    x='z1_jittered',
    y='z2_jittered',
    color='party',
    symbol="gender",
    size='num votes',
    title='Biplot',
    width=800,
    height=600,
    size_max=10,
    opacity=0.7,
    color_discrete_map={'Democrat': 'blue', 'Republican': 'red', "Independent": "green"},
    hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])

for (call, pc1, pc2) in loadings.head(20).itertuples():
    fig.add_scatter(x=[0, pc1], y=[0, pc2], name=call,
                    mode='lines+markers', textposition='top right',
                    marker=dict(size=10, symbol="arrow-bar-up", angleref="previous"))
fig
Each roll call from the 116th Congress - 1st Session: https://clerk.house.gov/evs/2019/ROLL_500.asp
+As shown in the demo, the primary goal of PCA is to transform observations from high-dimensional data down to low dimensions through linear transformations.
+In machine learning, PCA is often used as a preprocessing step prior to training a supervised model.
+Let’s explore how PCA is useful for building an image classification model based on the Fashion-MNIST dataset, a dataset containing images of articles of clothing; these images are gray scale with a size of 28 by 28 pixels. The copyright for Fashion-MNIST is held by Zalando SE. Fashion-MNIST is licensed under the MIT license.
+First, we’ll load in the data.
+import requests
+from pathlib import Path
+import time
+import gzip
+import os
+import numpy as np
+import plotly.express as px
+
+def fetch_and_cache(data_url, file, data_dir="data", force=False):
+"""
+ Download and cache a url and return the file object.
+
+ data_url: the web address to download
+ file: the file in which to save the results.
+ data_dir: (default="data") the location to save the data
+ force: if true the file is always re-downloaded
+
+ return: The pathlib.Path object representing the file.
+ """
+
+= Path(data_dir)
+ data_dir =True)
+ data_dir.mkdir(exist_ok= data_dir / Path(file)
+ file_path # If the file already exists and we want to force a download then
+ # delete the file first so that the creation date is correct.
+ if force and file_path.exists():
+
+ file_path.unlink()if force or not file_path.exists():
+ print("Downloading...", end=" ")
+ = requests.get(data_url)
+ resp with file_path.open("wb") as f:
+
+ f.write(resp.content)print("Done!")
+ = time.ctime(file_path.stat().st_mtime)
+ last_modified_time else:
+ = time.ctime(file_path.stat().st_mtime)
+ last_modified_time print("Using cached version that was downloaded (UTC):", last_modified_time)
+ return file_path
+
+
+def head(filename, lines=5):
+"""
+ Returns the first few lines of a file.
+
+ filename: the name of the file to open
+ lines: the number of lines to include
+
+ return: A list of the first few lines from the file.
+ """
+from itertools import islice
+
+with open(filename, "r") as f:
+ return list(islice(f, lines))
+
+
+def load_data():
+"""
+ Loads the Fashion-MNIST dataset.
+
+ This is a dataset of 60,000 28x28 grayscale images of 10 fashion categories,
+ along with a test set of 10,000 images. This dataset can be used as
+ a drop-in replacement for MNIST.
+
+ The classes are:
+
+ | Label | Description |
+ |:-----:|-------------|
+ | 0 | T-shirt/top |
+ | 1 | Trouser |
+ | 2 | Pullover |
+ | 3 | Dress |
+ | 4 | Coat |
+ | 5 | Sandal |
+ | 6 | Shirt |
+ | 7 | Sneaker |
+ | 8 | Bag |
+ | 9 | Ankle boot |
+
+ Returns:
+ Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+ **x_train**: uint8 NumPy array of grayscale image data with shapes
+ `(60000, 28, 28)`, containing the training data.
+
+ **y_train**: uint8 NumPy array of labels (integers in range 0-9)
+ with shape `(60000,)` for the training data.
+
+ **x_test**: uint8 NumPy array of grayscale image data with shapes
+ (10000, 28, 28), containing the test data.
+
+ **y_test**: uint8 NumPy array of labels (integers in range 0-9)
+ with shape `(10000,)` for the test data.
+
+ Example:
+
+ (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
+ assert x_train.shape == (60000, 28, 28)
+ assert x_test.shape == (10000, 28, 28)
+ assert y_train.shape == (60000,)
+ assert y_test.shape == (10000,)
+
+ License:
+ The copyright for Fashion-MNIST is held by Zalando SE.
+ Fashion-MNIST is licensed under the [MIT license](
+ https://github.com/zalandoresearch/fashion-mnist/blob/master/LICENSE).
+
+ """
+= os.path.join("datasets", "fashion-mnist")
+ dirname = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
+ base = [
+ files "train-labels-idx1-ubyte.gz",
+ "train-images-idx3-ubyte.gz",
+ "t10k-labels-idx1-ubyte.gz",
+ "t10k-images-idx3-ubyte.gz",
+
+ ]
+= []
+ paths for fname in files:
+ + fname, fname))
+ paths.append(fetch_and_cache(base # paths.append(get_file(fname, origin=base + fname, cache_subdir=dirname))
+
+with gzip.open(paths[0], "rb") as lbpath:
+ = np.frombuffer(lbpath.read(), np.uint8, offset=8)
+ y_train
+with gzip.open(paths[1], "rb") as imgpath:
+ = np.frombuffer(imgpath.read(), np.uint8, offset=16).reshape(
+ x_train len(y_train), 28, 28
+
+ )
+with gzip.open(paths[2], "rb") as lbpath:
+ = np.frombuffer(lbpath.read(), np.uint8, offset=8)
+ y_test
+with gzip.open(paths[3], "rb") as imgpath:
+ = np.frombuffer(imgpath.read(), np.uint8, offset=16).reshape(
+ x_test len(y_test), 28, 28
+
+ )
+return (x_train, y_train), (x_test, y_test)
class_names = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]
class_dict = {i: class_name for i, class_name in enumerate(class_names)}

(train_images, train_labels), (test_images, test_labels) = load_data()
print("Training images", train_images.shape)
print("Test images", test_images.shape)

rng = np.random.default_rng(42)
n = 5000
sample_idx = rng.choice(np.arange(len(train_images)), size=n, replace=False)

# Invert and normalize the images so they look better
img_mat = -1 * train_images[sample_idx].astype(np.int16)
img_mat = (img_mat - img_mat.min()) / (img_mat.max() - img_mat.min())

images = pd.DataFrame(
    {
        "images": img_mat.tolist(),
        "labels": train_labels[sample_idx],
        "class": [class_dict[x] for x in train_labels[sample_idx]],
    }
)
Using cached version that was downloaded (UTC): Tue Aug 27 03:33:08 2024
+Using cached version that was downloaded (UTC): Tue Aug 27 03:33:08 2024
+Using cached version that was downloaded (UTC): Tue Aug 27 03:33:08 2024
+Using cached version that was downloaded (UTC): Tue Aug 27 03:33:08 2024
+Training images (60000, 28, 28)
+Test images (10000, 28, 28)
+Let’s see what some of the images contained in this dataset look like.
def show_images(images, ncols=5, max_images=30):
    # convert the subset of images into a n,28,28 matrix for facet visualization
    img_mat = np.array(images.head(max_images)["images"].to_list())
    fig = px.imshow(
        img_mat,
        color_continuous_scale="gray",
        facet_col=0,
        facet_col_wrap=ncols,
        height=220 * int(np.ceil(len(images) / ncols)),
    )
    fig.update_layout(coloraxis_showscale=False)
    # Extract the facet number and convert it back to the class label.
    fig.for_each_annotation(
        lambda a: a.update(text=images.iloc[int(a.text.split("=")[-1])]["class"])
    )
    return fig


fig = show_images(images.groupby("class", as_index=False).sample(2), ncols=6)
fig.show()
Let’s break this down further and look at it by class, or the category of clothing:
print(class_dict)

show_images(images.groupby('class', as_index=False).sample(2), ncols=6)
{0: 'T-shirt/top', 1: 'Trouser', 2: 'Pullover', 3: 'Dress', 4: 'Coat', 5: 'Sandal', 6: 'Shirt', 7: 'Sneaker', 8: 'Bag', 9: 'Ankle boot'}
As we can see, each 28x28 pixel image is labelled by the category of clothing it belongs to. We humans can very easily look at these images and identify the type of clothing being displayed, even if the image is a little blurry. However, this task is less intuitive for machine learning models. To illustrate this, let’s take a small sample of the training data to see how the images above are represented in their raw format:
+ images.head()
+ | images | +labels | +class | +
---|---|---|---|
0 | +[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,... | +3 | +Dress | +
1 | +[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,... | +4 | +Coat | +
2 | +[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,... | +0 | +T-shirt/top | +
3 | +[[1.0, 1.0, 1.0, 1.0, 1.0, 0.996078431372549, ... | +2 | +Pullover | +
4 | +[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,... | +1 | +Trouser | +
Each row represents one image. Every image belongs to a "class" of clothing with its enumerated "label". In place of a typically displayed image, the raw data contains a 28x28 2D array of pixel values; each pixel value is a float between 0 and 1. If we just focus on the images, we get a 3D matrix. You can think of this as a matrix containing 2D images.
= np.array(images["images"].to_list())
X.shape
(5000, 28, 28)
However, we’re not used to working with 3D matrices for our training data X. Typical training data expects a vector of features for each datapoint, not a matrix per datapoint. We can reshape our 3D matrix so that it fits our typical training data by “unrolling” the 28x28 pixels into a single row vector containing 28*28 = 784 dimensions.
X = X.reshape(X.shape[0], -1)
X.shape
(5000, 784)
What we have now is 5000 datapoints that each have 784 features. That’s a lot of features! Not only would training a model on this data take a very long time, it’s also very likely that many of these features are redundant, meaning the columns of our matrix are unlikely to all be linearly independent. PCA is a very good strategy to use in situations like these when there are lots of features, but we want to remove redundant information.
+sklearn
To perform PCA, let’s begin by centering our data.
X = X - X.mean(axis=0)
We can run PCA using sklearn’s PCA package.
from sklearn.decomposition import PCA
+
n_comps = 50
pca = PCA(n_components=n_comps)
pca.fit(X)
PCA(n_components=50)
Now that sklearn helped us find the principal components, let’s visualize a scree plot.
# Make a line plot and show markers
fig = px.line(y=pca.explained_variance_ratio_ * 100, markers=True)
fig.show()
We can see that the line starts flattening out around 2 or 3, which suggests that most of the data is explained by just the first two or three dimensions. To illustrate this, let’s plot the first three principal components and the datapoints’ corresponding classes. Can you identify any patterns?
images[['z1', 'z2', 'z3']] = pca.transform(X)[:, :3]
fig = px.scatter_3d(images, x='z1', y='z2', z='z3', color='class', hover_data=['labels'],
                    width=1000, height=800)
# set marker size to 5
fig.update_traces(marker=dict(size=5))
As we saw in the demos, we often perform PCA during the Exploratory Data Analysis (EDA) stage of our data science lifecycle (if we already know what to model, we probably don’t need PCA!). It helps us with:
+PCA is commonly used in biomedical contexts, which have many named variables! It can be used to:
+ +Suppose we know the child mortality rate of a given country. Linear regression tries to predict the fertility rate from the mortality rate; for example, if the mortality is 6, we might guess the fertility is near 4. The regression line tells us the “best” prediction of fertility given all possible mortality values by minimizing the root mean squared error. See the vertical red lines (note that only some are shown).
+
We can also perform a regression in the reverse direction. That is, given fertility, we try to predict mortality. In this case, we get a different regression line that minimizes the root mean squared length of the horizontal lines.
The rank-1 approximation is close but not the same as the mortality regression line. Instead of minimizing horizontal or vertical error, our rank-1 approximation minimizes the error perpendicular to the subspace onto which we’re projecting. That is, SVD finds the line such that if we project our data onto that line, the error between the projection and our original data is minimized. The similarity of the rank-1 approximation and the fertility was just a coincidence. Looking at adiposity and bicep size from our body measurements dataset, we see the 1D subspace onto which we are projecting is between the two regression lines.
+Even in higher dimensions, the idea behind principal components is the same! Suppose we have 30-dimensional data and decide to use the first 5 principal components. Our procedure minimizes the error between the original 30-dimensional data and the projection of that 30-dimensional data onto the “best” 5-dimensional subspace. See CS 189 Note 10 for more details.
+One key fact to remember is that the decomposition is not arbitrary. The rank of a matrix limits how small our inner dimensions can be if we want to perfectly recreate our matrix. The proof for this is out of scope.
+Even if we know we have to factorize our matrix using an inner dimension of \(R\), that still leaves a large space of solutions to traverse. What if we have a procedure to automatically factorize a rank \(R\) matrix into an \(R\)-dimensional representation with some transformation matrix?
+What if we wanted a 2D representation? It’s valuable to compress all of the data that is relevant into as few dimensions as possible in order to plot it efficiently. Some 2D matrices yield better approximations than others. How well can we do?
+The proof defining component score is out of scope for this class, but it is included below for your convenience.
+Setup: Consider the design matrix \(X \in \mathbb{R}^{n \times d}\), where the \(j\)-th column (corresponding to the \(j\)-th feature) is \(x_j \in \mathbb{R}^n\) and the element in row \(i\), column \(j\) is \(x_{ij}\). Further, define \(\tilde{X}\) as the centered design matrix. The \(j\)-th column is \(\tilde{x}_j \in \mathbb{R}^n\) and the element in row \(i\), column \(j\) is \(\tilde{x}_{ij} = x_{ij} - \bar{x_j}\), where \(\bar{x_j}\) is the mean of the \(x_j\) column vector from the original \(X\).
+Variance: Construct the covariance matrix: \(\frac{1}{n} \tilde{X}^T \tilde{X} \in \mathbb{R}^{d \times d}\). The \(j\)-th element along the diagonal is the variance of the \(j\)-th column of the original design matrix \(X\):
\[\left( \frac{1}{n} \tilde{X}^T \tilde{X} \right)_{jj} = \frac{1}{n} \tilde{x}_j ^T \tilde{x}_j = \frac{1}{n} \sum_{i=1}^n (\tilde{x}_{ij} )^2 = \frac{1}{n} \sum_{i=1}^n (x_{ij} - \bar{x_j})^2\]
+SVD: Suppose singular value decomposition of the centered design matrix \(\tilde{X}\) yields \(\tilde{X} = U S V^T\), where \(U \in \mathbb{R}^{n \times d}\) and \(V \in \mathbb{R}^{d \times d}\) are matrices with orthonormal columns, and \(S \in \mathbb{R}^{d \times d}\) is a diagonal matrix with singular values of \(\tilde{X}\).
+\[ +\begin{aligned} +\tilde{X}^T \tilde{X} &= (U S V^T )^T (U S V^T) \\ +&= V S U^T U S V^T & (S^T = S) \\ +&= V S^2 V^T & (U^T U = I) \\ +\frac{1}{n} \tilde{X}^T \tilde{X} &= \frac{1}{n} V S V^T =V \left( \frac{1}{n} S \right) V^T \\ +\frac{1}{n} \tilde{X}^T \tilde{X} V &= V \left( \frac{1}{n} S \right) V^T V = V \left( \frac{1}{n} S \right) & \text{(right multiply by }V \rightarrow V^T V = I \text{)} \\ +V^T \frac{1}{n} \tilde{X}^T \tilde{X} V &= V^T V \left( \frac{1}{n} S \right) = \frac{1}{n} S & \text{(left multiply by }V^T \rightarrow V^T V = I \text{)} \\ +\left( \frac{1}{n} \tilde{X}^T \tilde{X} \right)_{jj} &= \frac{1}{n}S_j^2 & \text{(Define }S_j\text{ as the} j\text{-th singular value)} \\ +\frac{1}{n} S_j^2 &= \frac{1}{n} \sum_{i=i}^n (x_{ij} - \bar{x_j})^2 +\end{aligned} +\]
+The last line defines the \(j\)-th component score.
+ + +In the past few lectures, we’ve examined the role of complexity in influencing model performance. We’ve considered model complexity in the context of a tradeoff between two competing factors: model variance and training error.
+So far, our analysis has been mostly qualitative. We’ve acknowledged that our choice of model complexity needs to strike a balance between model variance and training error, but we haven’t yet discussed why exactly this tradeoff exists.
To better understand the origin of this tradeoff, we will need to dive into random variables. The next two course notes on probability will be a brief digression from our work on modeling so we can build up the concepts needed to understand this so-called bias-variance tradeoff. Specifically, we will cover:
+We’ll go over just enough probability to help you understand its implications for modeling, but if you want to go a step further, take Data 140, CS 70, and/or EECS 126.
+ +In Data 100, we want to understand the broader relationship between the following:
+Suppose we generate a set of random data, like a random sample from some population. A random variable is a function from the outcome of a random event to a number.
+It is random since our sample was drawn at random; it is variable because its exact value depends on how this random sample came out. As such, the domain or input of our random variable is all possible outcomes for some random event in a sample space, and its range or output is the real number line. We typically denote random variables with uppercase letters, such as \(X\) or \(Y\). In contrast, note that regular variables tend to be denoted using lowercase letters. Sometimes we also use uppercase letters to refer to matrices (such as your design matrix \(\mathbb{X}\)), but we will do our best to be clear with the notation.
+To motivate what this (rather abstract) definition means, let’s consider the following examples:
+Let’s formally define a fair coin toss. A fair coin can land on heads (\(H\)) or tails (\(T\)), each with a probability of 0.5. With these possible outcomes, we can define a random variable \(X\) as: \[X = \begin{cases} + 1, \text{if the coin lands heads} \\ + 0, \text{if the coin lands tails} + \end{cases}\]
+\(X\) is a function with a domain, or input, of \(\{H, T\}\) and a range, or output, of \(\{1, 0\}\). In practice, while we don’t use the following function notation, you could write the above as \[X = \begin{cases} X(H) = 1 \\ X(T) = 0 \end{cases}\]
+Suppose we draw a random sample \(s\) of size 3 from all students enrolled in Data 100.
+We can define \(Y\) as the number of data science students in our sample. Its domain is all possible samples of size 3, and its range is \(\{0, 1, 2, 3\}\).
+
+
+
Note that we can use random variables in mathematical expressions to create new random variables.
+For example, let’s say we sample 3 students at random from lecture and look at their midterm scores. Let \(X_1\), \(X_2\), and \(X_3\) represent each student’s midterm grade.
+We can use these random variables to create a new random variable, \(Y\), which represents the average of the 3 scores: \(Y = (X_1 + X_2 + X_3)/3\).
+As we’re creating this random variable, a few questions arise:
+But, what exactly is a distribution? Let’s dive into this!
+To define any random variable \(X\), we need to be able to specify 2 things:
+If \(X\) is discrete (has a finite number of possible values), the probability that a random variable \(X\) takes on the value \(x\) is given by \(P(X=x)\), and probabilities must sum to 1: \(\sum_{\text{all } x} P(X=x) = 1\),
+We can often display this using a probability distribution table. In the coin toss example, the probability distribution table of \(X\) is given by.
+\(x\) | +\(P(X=x)\) | +
---|---|
0 | +\(\frac{1}{2}\) | +
1 | +\(\frac{1}{2}\) | +
The distribution of a random variable \(X\) describes how the total probability of 100% is split across all the possible values of \(X\), and it fully defines a random variable. If you know the distribution of a random variable you can:
+np.random.choice
, df.sample
, or scipy.stats.<dist>.rvs(...)
The distribution of a discrete random variable can also be represented using a histogram. If a variable is continuous, meaning it can take on infinitely many values, we can illustrate its distribution using a density curve.
+
+
+
We often don’t know the (true) distribution and instead compute an empirical distribution. If you flip a coin 3 times and get {H, H, T}, you may ask —— what is the probability that the coin will land heads? We can come up with an empirical estimate of \(\frac{2}{3}\), though the true probability might be \(\frac{1}{2}\).
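A tiny simulation sketch (my own, assuming only NumPy) makes this concrete: with a handful of flips the empirical distribution can be far from the true one, but it settles down as the number of flips grows.

import numpy as np

rng = np.random.default_rng(100)
for n_flips in [3, 100, 10_000]:
    flips = rng.choice(["H", "T"], size=n_flips)   # simulate n_flips tosses of a fair coin
    print(n_flips, (flips == "H").mean())          # empirical P(H); the true value is 0.5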
Probabilities are areas. For discrete random variables, the area of the red bars represents the probability that a discrete random variable \(X\) falls within those values. For continuous random variables, the area under the curve represents the probability that a continuous random variable \(Y\) falls within those values.
+
+
+
If we sum up the total area of the bars/under the density curve, we should get 100%, or 1.
+We can show the distribution of \(Y\) in the following tables. The table on the left lists all possible samples of \(s\) and the number of times they can appear (\(Y(s)\)). We can use this to calculate the values for the table on the right, a probability distribution table.
+
+
+
Rather than fully write out a probability distribution or show a histogram, there are some common distributions that come up frequently when doing data science. These distributions are specified by some parameters, which are constants that specify the shape of the distribution. In terms of notation, the ‘~’ means “has the probability distribution of”.
+These common distributions are listed below:
+There are several ways to describe a random variable. The methods shown above —— a table of all samples \(s, X(s)\), distribution table \(P(X=x)\), and histograms —— are all definitions that fully describe a random variable. Often, it is easier to describe a random variable using some numerical summary rather than fully defining its distribution. These numerical summaries are numbers that characterize some properties of the random variable. Because they give a “summary” of how the variable tends to behave, they are not random. Instead, think of them as a static number that describes a certain property of the random variable. In Data 100, we will focus our attention on the expectation and variance of a random variable.
The expectation of a random variable \(X\) is the weighted average of the values of \(X\), where the weights are the probabilities of each value occurring. There are two equivalent ways to compute the expectation: apply the weights one sample at a time, \(\mathbb{E}[X] = \sum_{\text{all possible } s} X(s) P(s)\), or apply the weights one possible value at a time, \(\mathbb{E}[X] = \sum_{x} x P(X=x)\).
+The latter is more commonly used as we are usually just given the distribution, not all possible samples.
+We want to emphasize that the expectation is a number, not a random variable. Expectation is a generalization of the average, and it has the same units as the random variable. It is also the center of gravity of the probability distribution histogram, meaning if we simulate the variable many times, it is the long-run average of the simulated values.
+Going back to our coin toss example, we define a random variable \(X\) as: \[X = \begin{cases} + 1, \text{if the coin lands heads} \\ + 0, \text{if the coin lands tails} + \end{cases}\]
+We can calculate its expectation \(\mathbb{E}[X]\) using the second method of applying the weights one possible value at a time: \[\begin{align} +\mathbb{E}[X] &= \sum_{x} x P(X=x) \\ +&= 1 * 0.5 + 0 * 0.5 \\ +&= 0.5 +\end{align}\]
+Note that \(\mathbb{E}[X] = 0.5\) is not a possible value of \(X\); it’s an average. The expectation of X does not need to be a possible value of X.
+Consider the random variable \(X\):
+\(x\) | +\(P(X=x)\) | +
---|---|
3 | +0.1 | +
4 | +0.2 | +
6 | +0.4 | +
8 | +0.3 | +
To calculate it’s expectation, \[\begin{align} +\mathbb{E}[X] &= \sum_{x} x P(X=x) \\ +&= 3 * 0.1 + 4 * 0.2 + 6 * 0.4 + 8 * 0.3 \\ +&= 0.3 + 0.8 + 2.4 + 2.4 \\ +&= 5.9 +\end{align}\]
+Again, note that \(\mathbb{E}[X] = 5.9\) is not a possible value of \(X\); it’s an average. The expectation of X does not need to be a possible value of X.
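We can also check this kind of calculation by simulation (a minimal sketch of my own, assuming only NumPy): the long-run average of simulated draws approaches the expectation.

import numpy as np

rng = np.random.default_rng(0)
values, probs = [3, 4, 6, 8], [0.1, 0.2, 0.4, 0.3]
draws = rng.choice(values, p=probs, size=100_000)
print(draws.mean())                                # close to 5.9
print(sum(v * p for v, p in zip(values, probs)))   # exactly 5.9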
+The variance of a random variable is a measure of its chance error. It is defined as the expected squared deviation from the expectation of \(X\). Put more simply, variance asks: how far does \(X\) typically vary from its average value, just by chance? What is the spread of \(X\)’s distribution?
+\[\text{Var}(X) = \mathbb{E}[(X-\mathbb{E}[X])^2]\]
+The units of variance are the square of the units of \(X\). To get it back to the right scale, use the standard deviation of \(X\): \[\text{SD}(X) = \sqrt{\text{Var}(X)}\]
+Like with expectation, variance and standard deviation are numbers, not random variables! Variance helps us describe the variability of a random variable. It is the expected squared error between the random variable and its expected value. As you will see shortly, we can use variance to help us quantify the chance error that arises when using a sample \(X\) to estimate the population mean.
+By Chebyshev’s inequality, which you saw in Data 8, no matter what the shape of the distribution of \(X\) is, the vast majority of the probability lies in the interval “expectation plus or minus a few SDs.”
+If we expand the square and use properties of expectation, we can re-express variance as the computational formula for variance.
+\[\text{Var}(X) = \mathbb{E}[X^2] - (\mathbb{E}[X])^2\]
This form is often more convenient to use when computing the variance of a variable by hand, and it is also useful in Mean Squared Error calculations, since \(\mathbb{E}[X^2] = \text{Var}(X)\) when \(X\) is centered, that is, when \(\mathbb{E}[X]=0\).
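For completeness, the re-expression follows from expanding the square and using the fact that \(\mathbb{E}[X]\) is a constant together with the linearity of expectation (introduced below):

\[\begin{align}
\text{Var}(X) &= \mathbb{E}[(X-\mathbb{E}[X])^2] \\
&= \mathbb{E}[X^2 - 2X\mathbb{E}[X] + (\mathbb{E}[X])^2] \\
&= \mathbb{E}[X^2] - 2\mathbb{E}[X]\mathbb{E}[X] + (\mathbb{E}[X])^2 \\
&= \mathbb{E}[X^2] - (\mathbb{E}[X])^2
\end{align}\]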
+ +How do we compute \(\mathbb{E}[X^2]\)? Any function of a random variable is also a random variable. That means that by squaring \(X\), we’ve created a new random variable. To compute \(\mathbb{E}[X^2]\), we can simply apply our definition of expectation to the random variable \(X^2\).
+\[\mathbb{E}[X^2] = \sum_{x} x^2 P(X = x)\]
Let \(X\) be the outcome of a single fair die roll. \(X\) is a random variable with distribution \[P(X=x) = \begin{cases}
    \frac{1}{6}, \text{if } x \in \{1,2,3,4,5,6\} \\
    0, \text{otherwise}
  \end{cases}\]
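Working through the die example with the formulas above:

\[\begin{align}
\mathbb{E}[X] &= \sum_{x} x P(X=x) = \frac{1+2+3+4+5+6}{6} = 3.5 \\
\mathbb{E}[X^2] &= \sum_{x} x^2 P(X=x) = \frac{1+4+9+16+25+36}{6} = \frac{91}{6} \\
\text{Var}(X) &= \mathbb{E}[X^2] - (\mathbb{E}[X])^2 = \frac{91}{6} - (3.5)^2 = \frac{35}{12} \approx 2.92
\end{align}\]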
+ + +We can summarize our discussion so far in the following diagram:
+
+
+
Often, we will work with multiple random variables at the same time. A function of a random variable is also a random variable. If you create multiple random variables based on your sample, then functions of those random variables are also random variables.
+For example, if \(X_1, X_2, ..., X_n\) are random variables, then so are functions of them such as the sum \(\sum_{i=1}^n X_i\), the sample mean \(\frac{1}{n}\sum_{i=1}^n X_i\), and the maximum \(\max(X_1, \ldots, X_n)\).
+Many functions of random variables that we are interested in (e.g., counts, means) involve sums of random variables, so let’s dive deeper into the properties of sums of random variables.
+Instead of simulating full distributions, we often just compute expectation and variance directly. Recall the definition of expectation: \[\mathbb{E}[X] = \sum_{x} x P(X=x)\]
+From it, we can derive some useful properties:
+\[\mathbb{E}[aX+b] = a\mathbb{E}[X] + b\]
+ +\[\mathbb{E}[X+Y] = \mathbb{E}[X] + \mathbb{E}[Y]\]
+ +Let’s now get into the properties of variance. Recall the definition of variance: \[\text{Var}(X) = \mathbb{E}[(X-\mathbb{E}[X])^2]\]
+Combining it with the properties of expectation, we can derive some useful properties, such as \(\text{Var}(aX+b) = a^2 \text{Var}(X)\) and \(\text{Var}(X+Y) = \text{Var}(X) + \text{Var}(Y) + 2\text{Cov}(X, Y)\):
+
+
+
We define the covariance of two random variables as the expected product of their deviations from their expectations. Put more simply, covariance generalizes variance to a pair of random variables; in fact, the covariance of a random variable with itself is just its variance:
+\[\text{Cov}(X, X) = \mathbb{E}[(X - \mathbb{E}[X])^2] = \text{Var}(X)\]
+\[\text{Cov}(X, Y) = \mathbb{E}[(X - \mathbb{E}[X])(Y - \mathbb{E}[Y])]\]
+We can treat the covariance as a measure of association. Remember the definition of correlation given when we first established SLR?
+\[r(X, Y) = \mathbb{E}\left[\left(\frac{X-\mathbb{E}[X]}{\text{SD}(X)}\right)\left(\frac{Y-\mathbb{E}[Y]}{\text{SD}(Y)}\right)\right] = \frac{\text{Cov}(X, Y)}{\text{SD}(X)\text{SD}(Y)}\]
+It turns out we’ve been quietly using covariance for some time now! If \(X\) and \(Y\) are independent, then \(\text{Cov}(X, Y) =0\) and \(r(X, Y) = 0\). Note, however, that the converse is not always true: \(X\) and \(Y\) could have \(\text{Cov}(X, Y) = r(X, Y) = 0\) but not be independent.
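As an illustration of the relationship between covariance and correlation, here is a small simulation sketch (the data-generating choices below are arbitrary, just for demonstration):

import numpy as np

rng = np.random.default_rng(42)
x = rng.normal(size=100_000)
y = 0.5 * x + rng.normal(size=100_000)   # y depends on x, so Cov(x, y) > 0

cov_xy = np.mean((x - x.mean()) * (y - y.mean()))
r_xy = cov_xy / (x.std() * y.std())

# r_xy should match numpy's built-in correlation estimate
cov_xy, r_xy, np.corrcoef(x, y)[0, 1]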
+Suppose that we have two random variables \(X\) and \(Y\). We say they are independent and identically distributed (i.i.d.) if they are independent of each other and share the same distribution.
+Note that in Data 100, you’ll never be expected to prove that random variables are i.i.d.
+Now let’s walk through an example. Let \(X_1\) and \(X_2\) be the numbers on two rolls of a fair die. \(X_1\) and \(X_2\) are i.i.d., so they have the same distribution. However, the sums \(Y = X_1 + X_1 = 2X_1\) and \(Z = X_1 + X_2\) have different distributions, even though they have the same expectation.
+
+
+
However, \(Y = 2X_1\) has a larger variance: \(\text{Var}(Y) = 4\text{Var}(X_1)\), while \(\text{Var}(Z) = 2\text{Var}(X_1)\) since \(X_1\) and \(X_2\) are independent.
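A quick simulation sketch makes the difference concrete (die rolls generated with numpy):

import numpy as np

rng = np.random.default_rng(100)
x1 = rng.integers(1, 7, size=100_000)  # first die
x2 = rng.integers(1, 7, size=100_000)  # second die

y = 2 * x1       # Y = X1 + X1
z = x1 + x2      # Z = X1 + X2

# Same expectation (both close to 7), but Var(Y) = 4 Var(X1) while Var(Z) = 2 Var(X1)
y.mean(), z.mean(), y.var(), z.var()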
+
+
+
To get some practice with the formulas discussed so far, let’s derive the expectation and variance for a Bernoulli(\(p\)) random variable. If \(X\) ~ Bernoulli(\(p\)),
+\(\mathbb{E}[X] = 1 \cdot p + 0 \cdot (1 - p) = p\)
+To compute the variance, we will use the computational formula. We first find that: \(\mathbb{E}[X^2] = 1^2 \cdot p + 0^2 \cdot (1 - p) = p\)
+From there, let’s calculate our variance: \(\text{Var}(X) = \mathbb{E}[X^2] - \mathbb{E}[X]^2 = p - p^2 = p(1-p)\)
+Let \(Y\) ~ Binomial(\(n\), \(p\)). We can think of \(Y\) as being the sum of \(n\) i.i.d. Bernoulli(\(p\)) random variables. Mathematically, this translates to
+\[Y = \sum_{i=1}^n X_i\]
+where \(X_i\) is the indicator of a success on trial \(i\).
+Using linearity of expectation,
+\[\mathbb{E}[Y] = \sum_{i=1}^n \mathbb{E}[X_i] = np\]
+For the variance, since each \(X_i\) is independent of the other, \(\text{Cov}(X_i, X_j) = 0\),
+\[\text{Var}(Y) = \sum_{i=1}^n \text{Var}[X_i] = np(1-p)\]
+Note that \(\text{Cov}(X,Y)\) would equal 0 if \(X\) and \(Y\) are independent.
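We can check these formulas empirically; the sketch below simulates a Binomial(\(n\), \(p\)) variable as a sum of i.i.d. Bernoulli(\(p\)) indicators (the particular \(n\) and \(p\) are arbitrary):

import numpy as np

rng = np.random.default_rng(0)
n, p = 20, 0.3

# each row is one realization of Y = X_1 + ... + X_n
bernoullis = rng.binomial(1, p, size=(100_000, n))
y = bernoullis.sum(axis=1)

print(y.mean(), n * p)             # expectation: close to np = 6.0
print(y.var(), n * p * (1 - p))    # variance: close to np(1-p) = 4.2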
+ + + + +Last time, we introduced the idea of random variables: numerical functions of a sample. Most of our work in the last lecture was done to build a background in probability and statistics. Now that we’ve established some key ideas, we’re in a good place to apply what we’ve learned to our original goal – understanding how the randomness of a sample impacts the model design process.
+In this lecture, we will delve more deeply into the idea of fitting a model to a sample. We’ll explore how to re-express our modeling process in terms of random variables and use this new understanding to steer model complexity.
+There are several cases of random variables that appear often and have useful properties. Below are the ones we will explore further in this course. The numbers in parentheses are the parameters of a random variable, which are constants. Parameters define a random variable’s shape (i.e., distribution) and its values. For this lecture, we’ll focus more heavily on the bolded random variables and their special properties, but you should familiarize yourself with all the ones listed below:
+Suppose you win cash based on the number of heads you get in a series of 20 coin flips. Let \(X_i = 1\) if the \(i\)-th coin is heads, \(0\) otherwise. Which payout strategy would you choose?
+A. \(Y_A = 10 * X_1 + 10 * X_2\)
+B. \(Y_B = \sum_{i=1}^{20} X_i\)
+C. \(Y_C = 20 * X_1\)
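One way to reason about the three options is to note that, for a fair coin, all of them have the same expectation but very different spreads; a short simulation sketch:

import numpy as np

rng = np.random.default_rng(1)
flips = rng.binomial(1, 0.5, size=(100_000, 20))  # each row: 20 fair coin flips

y_a = 10 * flips[:, 0] + 10 * flips[:, 1]  # strategy A
y_b = flips.sum(axis=1)                    # strategy B
y_c = 20 * flips[:, 0]                     # strategy C

# all three have expectation 10, but B has the smallest variance and C the largest
[(y.mean(), y.var()) for y in (y_a, y_b, y_c)]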
+ +Today, we’ve talked extensively about populations; if we know the distribution of a random variable, we can reliably compute expectation, variance, functions of the random variable, etc. Note that:
+In Data Science, however, we often do not have access to the whole population, so we don’t know its distribution. As such, we need to collect a sample and use its distribution to estimate or infer properties of the population. In cases like these, we can take several samples of size \(n\) from the population (an easy way to do this is using df.sample(n, replace=True)
), and compute the mean of each sample. When sampling, we make the (big) assumption that we sample uniformly at random with replacement from the population; each observation in our sample is a random variable drawn i.i.d from our population distribution. Remember that our sample mean is a random variable since it depends on our randomly drawn sample! On the other hand, our population mean is simply a number (a fixed value).
Consider an i.i.d. sample \(X_1, X_2, ..., X_n\) drawn from a population with mean \(\mu\) and SD \(\sigma\). We define the sample mean as \[\bar{X}_n = \frac{1}{n} \sum_{i=1}^n X_i\]
+The expectation of the sample mean is given by: \[\begin{align} + \mathbb{E}[\bar{X}_n] &= \frac{1}{n} \sum_{i=1}^n \mathbb{E}[X_i] \\ + &= \frac{1}{n} (n \mu) \\ + &= \mu +\end{align}\]
+The variance is given by: \[\begin{align} + \text{Var}(\bar{X}_n) &= \frac{1}{n^2} \text{Var}( \sum_{i=1}^n X_i) \\ + &= \frac{1}{n^2} \left( \sum_{i=1}^n \text{Var}(X_i) \right) \\ + &= \frac{1}{n^2} (n \sigma^2) = \frac{\sigma^2}{n} +\end{align}\]
+\(\bar{X}_n\) is approximately normally distributed by the Central Limit Theorem (CLT).
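The \(\frac{\sigma^2}{n}\) scaling is easy to see in simulation; the sketch below uses an arbitrary skewed population chosen only for illustration:

import numpy as np

rng = np.random.default_rng(2)
population = rng.exponential(scale=2, size=1_000_000)  # population with mean 2 and SD 2

for n in [10, 100, 1000]:
    # many samples of size n, one sample mean per row
    sample_means = rng.choice(population, size=(10_000, n)).mean(axis=1)
    # SD of the sample means shrinks like sigma / sqrt(n)
    print(n, sample_means.mean(), sample_means.std(), population.std() / np.sqrt(n))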
+In Data 8 and in the previous lecture, you encountered the Central Limit Theorem (CLT). This is a powerful theorem for estimating the distribution of a population with mean \(\mu\) and standard deviation \(\sigma\) from a collection of smaller samples. The CLT tells us that if an i.i.d sample of size \(n\) is large, then the probability distribution of the sample mean is roughly normal with mean \(\mu\) and SD of \(\frac{\sigma}{\sqrt{n}}\). More generally, any theorem that provides the rough distribution of a statistic and doesn’t need the distribution of the population is valuable to data scientists! This is because we rarely know a lot about the population.
++
Importantly, the CLT assumes that each observation in our samples is drawn i.i.d from the distribution of the population. In addition, the CLT is accurate only when \(n\) is “large”, but what counts as a “large” sample size depends on the specific distribution. If a population is highly symmetric and unimodal, we could need as few as \(n=20\); if a population is very skewed, we need a larger \(n\). If in doubt, you can bootstrap the sample mean and see if the bootstrapped distribution is bell-shaped. Classes like Data 140 investigate this idea in great detail.
+For a more in-depth demo, check out onlinestatbook.
+Now let’s say we want to use the sample mean to estimate the population mean, for example, the average height of Cal undergraduates. We can typically collect a single sample, which has just one average. However, what if we happened, by random chance, to draw a sample with a different mean or spread than that of the population? We might get a skewed view of how the population behaves (consider the extreme case where we happen to sample the exact same value \(n\) times!).
+
+
+
For example, notice the difference in variation between these two distributions of sample means computed from different sample sizes. The distribution with a bigger sample size (\(n=800\)) is tighter around the mean than the distribution with a smaller sample size (\(n=200\)). Try plugging these values into the standard deviation equation for the sample mean to make sense of this!
+Applying the CLT allows us to make sense of all of this and resolve this issue. By drawing many samples, we can consider how the sample distribution varies across multiple subsets of the data. This allows us to approximate the properties of the population without the need to survey every single member.
+Given this potential variance, it is also important that we consider the average value and spread of all possible sample means, and what this means for how big \(n\) should be. For every sample size, the expected value of the sample mean is the population mean: \[\mathbb{E}[\bar{X}_n] = \mu\] We call the sample mean an unbiased estimator of the population mean and will explore this idea more in the next lecture.
+ +At this point in the course, we’ve spent a great deal of time working with models. When we first introduced the idea of modeling a few weeks ago, we did so in the context of prediction: using models to make accurate predictions about unseen data. Another reason we might build models is to better understand complex phenomena in the world around us. Inference is the task of using a model to infer the true underlying relationships between the feature and response variables. For example, if we are working with a set of housing data, prediction might ask: given the attributes of a house, how much is it worth? Inference might ask: how much does having a local park impact the value of a house?
+A major goal of inference is to draw conclusions about the full population of data given only a random sample. To do this, we aim to estimate the value of a parameter, which is a numerical function of the population (for example, the population mean \(\mu\)). We use a collected sample to construct a statistic, which is a numerical function of the random sample (for example, the sample mean \(\bar{X}_n\)). It’s helpful to think “p” for “parameter” and “population,” and “s” for “sample” and “statistic.”
+Since the sample represents a random subset of the population, any statistic we generate will likely deviate from the true population parameter, and it could have been different. We say that the sample statistic is an estimator of the true population parameter. Notationally, the population parameter is typically called \(\theta\), while its estimator is denoted by \(\hat{\theta}\).
+To address our inference question, we aim to construct estimators that closely estimate the value of the population parameter. We evaluate how “good” an estimator is by answering three questions:
+This relationship can be illustrated with an archery analogy. Imagine that the center of the target is \(\theta\) and that each arrow corresponds to a separate parameter estimate \(\hat{\theta}\).
+
+
+
Ideally, we want our estimator to have low bias and low variance, but how can we mathematically quantify that? See the Bias-Variance Tradeoff section later in this note for more detail.
+Now that we’ve established the idea of an estimator, let’s see how we can apply this learning to the modeling process. To do so, we’ll take a moment to formalize our data collection and models in the language of random variables.
+Say we are working with an input variable, \(x\), and a response variable, \(Y\). We assume that \(Y\) and \(x\) are linked by some relationship \(g\); in other words, \(Y = g(x)\) where \(g\) represents some “universal truth” or “law of nature” that defines the underlying relationship between \(x\) and \(Y\). In the image below, \(g\) is denoted by the red line.
+As data scientists, however, we have no way of directly “seeing” the underlying relationship \(g\). The best we can do is collect observed data out in the real world to try to understand this relationship. Unfortunately, the data collection process will always have some inherent error (think of the randomness you might encounter when taking measurements in a scientific experiment). We say that each observation comes with some random error or noise term, \(\epsilon\) (read: “epsilon”). This error is assumed to be a random variable with expectation \(\mathbb{E}(\epsilon)=0\), variance \(\text{Var}(\epsilon) = \sigma^2\), and be i.i.d. across each observation. The existence of this random noise means that our observations, \(Y(x)\), are random variables.
+
+
+
We can only observe our random sample of data, represented by the blue points above. From this sample, we want to estimate the true relationship \(g\). We do this by constructing the model \(\hat{Y}(x)\) to estimate \(g\).
+\[\text{True relationship: } g(x)\]
+\[\text{Observed relationship: }Y = g(x) + \epsilon\]
+\[\text{Prediction: }\hat{Y}(x)\]
+
+
+
When building models, it is also important to note that our choice of features will also significantly impact our estimation. In the plot below, you can see how the different models (green and purple) can lead to different estimates.
+
+
+
If we assume that the true relationship \(g\) is linear, we can express the response as \(Y = f_{\theta}(x)\), where our true relationship is modeled by \[Y = g(x) + \epsilon\] \[ Y = f_{\theta}(x) = \theta_0 + \sum_{j=1}^p \theta_j x_j + \epsilon\]
+ +This true relationship has true, unobservable parameters \(\theta\), and it has random noise \(\epsilon\), so we can never observe the true relationship. Instead, the next best thing we can do is obtain a sample \(\Bbb{X}\), \(\Bbb{Y}\) of \(n\) observed relationships, \((x, Y)\) and use it to train a model and obtain an estimate of \(\hat{\theta}\) \[\hat{Y}(x) = f_{\hat{\theta}}(x) = \hat{\theta_0} + \sum_{j=1}^p \hat{\theta_j} x_j\]
+ +Now taking a look at our original equations, we can see that they both have differing sources of randomness. For our observed relationship, \(Y = g(x) + \epsilon\), \(\epsilon\) represents errors which occur during or after the observation or measurement process. For the estimation model, the data we have is a random sample collected from the population, which was constructed from decisions made before the measurement process.
+Recall the model and the data we generated from that model in the last section:
+\[\text{True relationship: } g(x)\]
+\[\text{Observed relationship: }Y = g(x) + \epsilon\]
+\[\text{Prediction: }\hat{Y}(x)\]
+With this reformulated modeling goal, we can now revisit the Bias-Variance Tradeoff from two lectures ago (shown below):
+
+
+
In today’s lecture, we’ll explore a more mathematical version of the graph you see above by introducing the terms model risk, observation variance, model bias, and model variance. Eventually, we’ll work our way up to an updated version of the Bias-Variance Tradeoff graph that you see below
+
+
+
Model risk is defined as the mean square prediction error of the random variable \(\hat{Y}\). It is an expectation across all samples we could have possibly gotten when fitting the model, which we can denote as random variables \(X_1, X_2, \ldots, X_n, Y\). Model risk considers the model’s performance on any sample that is theoretically possible, rather than the specific data that we have collected.
+\[\text{model risk} = E\left[(Y-\hat{Y}(x))^2\right]\]
+What is the origin of the error encoded by model risk? Note that there are two types of errors: chance errors, which arise from randomness alone (the random noise in each observation and the randomness of the sample used to fit the model), and (model) bias, a non-random, systematic gap between our model and the truth.
+Recall the data-generating process we established earlier. There is a true underlying relationship \(g\), observed data (with random noise) \(Y\), and model \(\hat{Y}\).
+
+
+
To better understand model risk, we’ll zoom in on a single data point in the plot above.
+
+
+
Remember that \(\hat{Y}(x)\) is a random variable – it is the prediction made for \(x\) after being fit on the specific sample used for training. If we had used a different sample for training, a different prediction might have been made for this value of \(x\). To capture this, the diagram above considers both the prediction \(\hat{Y}(x)\) made for a particular random training sample, and the expected prediction across all possible training samples, \(E[\hat{Y}(x)]\).
+We can use this simplified diagram to break down the prediction error into smaller components. First, start by considering the error on a single prediction, \(Y(x)-\hat{Y}(x)\).
+
+
+
We can identify three components of this error.
+
+
+
That is, the error can be written as:
+\[Y(x)-\hat{Y}(x) = \epsilon + \left(g(x)-E\left[\hat{Y}(x)\right]\right) + \left(E\left[\hat{Y}(x)\right] - \hat{Y}(x)\right)\] \[\newline \]
+The model risk is the expected square of the expression above, \(E\left[(Y(x)-\hat{Y}(x))^2\right]\). If we square both sides and then take the expectation, we will get the following decomposition of model risk:
+\[E\left[(Y(x)-\hat{Y}(x))^2\right] = E[\epsilon^2] + \left(g(x)-E\left[\hat{Y}(x)\right]\right)^2 + E\left[\left(E\left[\hat{Y}(x)\right] - \hat{Y}(x)\right)^2\right]\]
+It looks like we are missing some cross-product terms when squaring the right-hand side, but it turns out that all of those cross-product terms are zero. The detailed derivation is out of scope for this class, but a proof is included at the end of this note for your reference.
+This expression may look complicated at first glance, but we’ve actually already defined each term earlier in this lecture! Let’s look at them term by term.
+The first term in the above decomposition is \(E[\epsilon^2]\). Remember \(\epsilon\) is the random noise when observing \(Y\), with expectation \(\mathbb{E}(\epsilon)=0\) and variance \(\text{Var}(\epsilon) = \sigma^2\). We can show that \(E[\epsilon^2]\) is the variance of \(\epsilon\): \[
\begin{align*}
\text{Var}(\epsilon) &= E[\epsilon^2] - \left(E[\epsilon]\right)^2\\
&= E[\epsilon^2] - 0^2\\
&= \sigma^2.
\end{align*}
\]
+This term describes how variable the random error \(\epsilon\) (and \(Y\)) is for each observation. This is called the observation variance. It exists due to the randomness in our observations \(Y\). It is a form of chance error we talked about in the Sampling lecture.
+\[\text{observation variance} = \text{Var}(\epsilon) = \sigma^2.\]
+The observation variance results from measurement errors when observing data or missing information that acts like noise. To reduce this observation variance, we could try to get more precise measurements, but it is often beyond the control of data scientists. Because of this, the observation variance \(\sigma^2\) is sometimes called “irreducible error.”
+We will then look at the last term: \(E\left[\left(E\left[\hat{Y}(x)\right] - \hat{Y}(x)\right)^2\right]\). If you recall the definition of variance from the last lecture, this is precisely \(\text{Var}(\hat{Y}(x))\). We call this the model variance.
+It describes how much the prediction \(\hat{Y}(x)\) tends to vary when we fit the model on different samples. Remember the sample we collect can come out very differently, thus the prediction \(\hat{Y}(x)\) will also be different. The model variance describes this variability due to the randomness in our sampling process. Like observation variance, it is also a form of chance error—even though the sources of randomness are different.
+\[\text{model variance} = \text{Var}(\hat{Y}(x)) = E\left[\left(\hat{Y}(x) - E\left[\hat{Y}(x)\right]\right)^2\right]\]
+The main reason for large model variance is overfitting: we pay so much attention to the details of our particular sample that small differences in the random sample lead to large differences in the fitted model. To remedy this, we try to reduce model complexity (e.g., take out some features and limit the magnitude of estimated model coefficients) so that we do not fit our model to the noise.
+Finally, the second term is \(\left(g(x)-E\left[\hat{Y}(x)\right]\right)^2\). What is this? The term \(E\left[\hat{Y}(x)\right] - g(x)\) is called the model bias.
+Remember that \(g(x)\) is the fixed underlying truth and \(\hat{Y}(x)\) is our fitted model, which is random. Model bias therefore measures how far off \(g(x)\) and \(\hat{Y}(x)\) are on average over all possible samples.
+\[\text{model bias} = E\left[\hat{Y}(x) - g(x)\right] = E\left[\hat{Y}(x)\right] - g(x)\]
+The model bias is not random; it’s an average measure for a specific individual \(x\). If bias is positive, our model tends to overestimate \(g(x)\); if it’s negative, our model tends to underestimate \(g(x)\). And if it’s 0, we can say that our model is unbiased.
+There are two main reasons for large model bias: the model is too simple to capture the underlying relationship (underfitting), or we lack the domain knowledge to choose features that reflect how the response is actually generated.
+To fix this, we increase model complexity (but we don’t want to overfit!) or consult domain experts to see which models make sense. You can start to see a tradeoff here: if we increase model complexity, we decrease the model bias, but we also risk increasing the model variance.
+To summarize:
+The above definitions enable us to simplify the decomposition of model risk before as:
+\[ E[(Y(x) - \hat{Y}(x))^2] = \sigma^2 + (E[\hat{Y}(x)] - g(x))^2 + \text{Var}(\hat{Y}(x)) \] \[\text{model risk} = \text{observation variance} + (\text{model bias})^2 + \text{model variance}\]
+This is known as the bias-variance tradeoff. What does it mean? Remember that the model risk is a measure of the model’s performance. Our goal in building models is to keep model risk low; this means that we will want to ensure that each component of model risk is kept at a small value.
+Observation variance is an inherent, random part of the data collection process. We aren’t able to reduce the observation variance, so we’ll focus our attention on the model bias and model variance.
+In the Feature Engineering lecture, we considered the issue of overfitting. We saw that the model’s error or bias tends to decrease as model complexity increases — if we design a highly complex model, it will tend to make predictions that are closer to the true relationship \(g\). At the same time, model variance tends to increase as model complexity increases; a complex model may overfit to the training data, meaning that small differences in the random samples used for training lead to large differences in the fitted model. We have a problem. To decrease model bias, we could increase the model’s complexity, which would lead to overfitting and an increase in model variance. Alternatively, we could decrease model variance by decreasing the model’s complexity at the cost of increased model bias due to underfitting.
+
+
+
We need to strike a balance. Our goal in model creation is to use a complexity level that is high enough to keep bias low, but not so high that model variance is large.
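To make the tradeoff concrete, here is a small simulation sketch: we repeatedly draw noisy samples from a known \(g\), fit a too-simple model (degree 1) and a more complex model (degree 5) with np.polyfit, and compare the estimated squared bias and variance of their predictions at a single point \(x_0\). All of the specific choices below (the form of \(g\), the noise level, the degrees) are arbitrary and chosen only for illustration:

import numpy as np

rng = np.random.default_rng(3)
g = lambda x: np.sin(2 * x)        # "true" relationship (unknown to us in practice)
x = np.linspace(0, 3, 30)          # fixed design points
x0, sigma = 1.5, 0.5               # prediction point and observation SD

preds = {1: [], 5: []}
for _ in range(2000):
    y = g(x) + rng.normal(0, sigma, size=x.size)   # one observed sample
    for degree in preds:
        coeffs = np.polyfit(x, y, degree)          # fit a polynomial model on this sample
        preds[degree].append(np.polyval(coeffs, x0))

for degree, p in preds.items():
    p = np.array(p)
    bias = p.mean() - g(x0)   # model bias at x0
    variance = p.var()        # model variance at x0
    print(degree, bias**2, variance)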
+This section walks through the detailed derivation of the Bias-Variance Decomposition in the Bias-Variance Tradeoff section above, and this content is out of scope.
+ + + + + +Last lecture, we learned of the difference between quantitative and qualitative variable types. The latter includes string data — the primary focus of lecture 6. In this note, we’ll discuss the necessary tools to manipulate text: Python string manipulation and regular expressions.
+There are two main reasons for working with text: canonicalization, converting data that can appear in many different formats into a single standard form, and extraction, pulling out specific pieces of information from text.
+First, we’ll introduce a few methods useful for string manipulation. The following table includes a number of string operations supported by Python and pandas
. The Python functions operate on a single string, while their equivalent in pandas
are vectorized — they operate on a Series
of string data.
Operation | +Python | +Pandas (Series ) |
+
---|---|---|
Transformation | +
|
+
|
+
Replacement + Deletion | +
|
+
|
+
Split | +
|
+
|
+
Substring | +
|
+
|
+
Membership | +
|
+
|
+
Length | +
|
+
|
+
We’ll discuss the differences between Python string functions and pandas
Series
methods in the following section on canonicalization.
Assume we want to merge the given tables.
+import pandas as pd
+
+with open('data/county_and_state.csv') as f:
+= pd.read_csv(f)
+ county_and_state
+ with open('data/county_and_population.csv') as f:
+= pd.read_csv(f) county_and_pop
; display(county_and_state), display(county_and_pop)
+ | County | +State | +
---|---|---|
0 | +De Witt County | +IL | +
1 | +Lac qui Parle County | +MN | +
2 | +Lewis and Clark County | +MT | +
3 | +St John the Baptist Parish | +LS | +
+ | County | +Population | +
---|---|---|
0 | +DeWitt | +16798 | +
1 | +Lac Qui Parle | +8067 | +
2 | +Lewis & Clark | +55716 | +
3 | +St. John the Baptist | +43044 | +
Last time, we used a primary key and foreign key to join two tables. While neither of these keys exist in our DataFrame
s, the "County"
columns look similar enough. Can we convert these columns into one standard, canonical form to merge the two tables?
The following function uses Python string manipulation to convert a single county name into canonical form. It does so by eliminating whitespace, punctuation, and unnecessary text.
def canonicalize_county(county_name):
    return (
        county_name
        .lower()
        .replace(' ', '')
        .replace('&', 'and')
        .replace('.', '')
        .replace('county', '')
        .replace('parish', '')
    )

canonicalize_county("St. John the Baptist")
'stjohnthebaptist'
+We will use the pandas
map
function to apply the canonicalize_county
function to every row in both DataFrame
s. In doing so, we’ll create a new column in each called clean_county_python
with the canonical form.
county_and_pop['clean_county_python'] = county_and_pop['County'].map(canonicalize_county)
county_and_state['clean_county_python'] = county_and_state['County'].map(canonicalize_county)
display(county_and_state), display(county_and_pop);
+ | County | +State | +clean_county_python | +
---|---|---|---|
0 | +De Witt County | +IL | +dewitt | +
1 | +Lac qui Parle County | +MN | +lacquiparle | +
2 | +Lewis and Clark County | +MT | +lewisandclark | +
3 | +St John the Baptist Parish | +LS | +stjohnthebaptist | +
+ | County | +Population | +clean_county_python | +
---|---|---|---|
0 | +DeWitt | +16798 | +dewitt | +
1 | +Lac Qui Parle | +8067 | +lacquiparle | +
2 | +Lewis & Clark | +55716 | +lewisandclark | +
3 | +St. John the Baptist | +43044 | +stjohnthebaptist | +
Alternatively, we can use pandas
Series
methods to create this standardized column. To do so, we must call the .str
attribute of our Series
object prior to calling any methods, like .lower
and .replace
. Notice how these method names match their equivalent built-in Python string functions.
Chaining multiple Series
methods in this manner eliminates the need to use the map
function (as this code is vectorized).
def canonicalize_county_series(county_series):
    return (
        county_series
        .str.lower()
        .str.replace(' ', '')
        .str.replace('&', 'and')
        .str.replace('.', '')
        .str.replace('county', '')
        .str.replace('parish', '')
    )
county_and_pop['clean_county_pandas'] = canonicalize_county_series(county_and_pop['County'])
county_and_state['clean_county_pandas'] = canonicalize_county_series(county_and_state['County'])
display(county_and_pop), display(county_and_state);
+ | County | +Population | +clean_county_python | +clean_county_pandas | +
---|---|---|---|---|
0 | +DeWitt | +16798 | +dewitt | +dewitt | +
1 | +Lac Qui Parle | +8067 | +lacquiparle | +lacquiparle | +
2 | +Lewis & Clark | +55716 | +lewisandclark | +lewisandclark | +
3 | +St. John the Baptist | +43044 | +stjohnthebaptist | +stjohnthebaptist | +
+ | County | +State | +clean_county_python | +clean_county_pandas | +
---|---|---|---|---|
0 | +De Witt County | +IL | +dewitt | +dewitt | +
1 | +Lac qui Parle County | +MN | +lacquiparle | +lacquiparle | +
2 | +Lewis and Clark County | +MT | +lewisandclark | +lewisandclark | +
3 | +St John the Baptist Parish | +LS | +stjohnthebaptist | +stjohnthebaptist | +
Extraction explores the idea of obtaining useful information from text data. This will be particularly important in model building, which we’ll study in a few weeks.
+Say we want to read some data from a .txt
file.
with open('data/log.txt', 'r') as f:
    log_lines = f.readlines()

log_lines
['169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n',
+ '193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"\n',
+ '169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"\n']
+Suppose we want to extract the day, month, year, hour, minutes, seconds, and time zone. Unfortunately, these items are not in a fixed position from the beginning of the string, so slicing by some fixed offset won’t work.
+Instead, we can use some clever thinking. Notice how the relevant information is contained within a set of brackets, further separated by /
and :
. We can hone in on this region of text, and split the data on these characters. Python’s built-in .split
function makes this easy.
first = log_lines[0]  # Only considering the first row of data

pertinent = first.split("[")[1].split(']')[0]
day, month, rest = pertinent.split('/')
year, hour, minute, rest = rest.split(':')
seconds, time_zone = rest.split(' ')
day, month, year, hour, minute, seconds, time_zone
+There are two problems with this code:
+map
function or pandas
Series
methods.In the next section, we’ll introduce regular expressions - a tool that solves problem 2.
+A regular expression (“RegEx”) is a sequence of characters that specifies a search pattern. They are written to extract specific information from text. Regular expressions are essentially part of a smaller programming language embedded in Python, made available through the re
module. As such, they have a stand-alone syntax and methods for various capabilities.
Regular expressions are useful in many applications beyond data science. For example, Social Security Numbers (SSNs) are often validated with regular expressions.
+r"[0-9]{3}-[0-9]{2}-[0-9]{4}" # Regular Expression Syntax
+
+# 3 of any digit, then a dash,
+# then 2 of any digit, then a dash,
+# then 4 of any digit
'[0-9]{3}-[0-9]{2}-[0-9]{4}'
+There are a ton of resources to learn and experiment with regular expressions. A few are provided below:
+There are four basic operations with regular expressions.
+Operation | +Order | +Syntax Example | +Matches | +Doesn’t Match | +
---|---|---|---|---|
Or : | |
+4 | +AA|BAAB | +AA BAAB |
+every other string | +
Concatenation |
+3 | +AABAAB | +AABAAB | +every other string | +
Closure : * (zero or more) |
+2 | +AB*A | +AA ABBBBBBA | +AB ABABA |
+
Group : () (parenthesis) |
+1 | +A(A|B)AAB (AB)*A |
+AAAAB ABAAB A ABABABABA |
+every other string AA ABBA |
+
Notice how these metacharacter operations are ordered. Rather than being literal characters, these metacharacters manipulate adjacent characters. ()
takes precedence, followed by *
, and finally |
. This allows us to differentiate between very different regex commands like AB*
and (AB)*
. The former reads “A
then zero or more copies of B
”, while the latter specifies “zero or more copies of AB
”.
Question 1: Give a regular expression that matches moon
, moooon
, etc. Your expression should match any even number of o
s except zero (i.e. don’t match mn
).
Answer 1: moo(oo)*n
oo
before the capture group ensures that mn
is not matched.(oo)*
ensures the number of o
’s is even.Question 2: Using only basic operations, formulate a regex that matches muun
, muuuun
, moon
, moooon
, etc. Your expression should match any even number of u
s or o
s except zero (i.e. don’t match mn
).
Answer 2: m(uu(uu)*|oo(oo)*)n
m
and trailing n
ensures that only strings beginning with m
and ending with n
are matched.|
.
+m(uu(uu)*)|(oo(oo)*)n
. This incorrectly matches muu
and oooon
.
+|
. The incorrect solution matches only half of the string, and ignores either the beginning m
or trailing n
.|
. That way, each OR clause is everything to the left and right of |
within the group. This ensures both the beginning m
and trailing n
are matched.Provided below are more complex regular expression functions.
+Operation | +Syntax Example | +Matches | +Doesn’t Match | +
---|---|---|---|
Any Character : . (except newline) |
+.U.U.U. | +CUMULUS JUGULUM |
+SUCCUBUS TUMULTUOUS | +
Character Class : [] (match one character in [] ) |
+[A-Za-z][a-z]* | +word Capitalized |
+camelCase 4illegal | +
Repeated "a" Times : {a} |
+j[aeiou]{3}hn | +jaoehn jooohn |
+jhn jaeiouhn |
+
Repeated "from a to b" Times : {a, b} |
+j[ou]{1,2}hn | +john juohn |
+jhn jooohn |
+
At Least One : + |
+jo+hn | +john joooooohn |
+jhn jjohn |
+
Zero or One : ? |
+joh?n | +jon john |
+any other string | +
A character class matches a single character in its class. These characters can be hardcoded —— in the case of [aeiou]
—— or shorthand can be specified to mean a range of characters. Examples include:
[A-Z]
: Any capitalized letter[a-z]
: Any lowercase letter[0-9]
: Any single digit[A-Za-z]
: Any capitalized of lowercase letter[A-Za-z0-9]
: Any capitalized or lowercase letter or single digitLet’s analyze a few examples of complex regular expressions.
+Matches | +Does Not Match | +
---|---|
|
++ |
RASPBERRY SPBOO |
+SUBSPACE SUBSPECIES |
+
|
++ |
231-41-5121 573-57-1821 |
+231415121 57-3571821 |
+
|
++ |
horse@pizza.com horse@pizza.food.com |
+frank_99@yahoo.com hug@cs |
+
Explanations
+.*SPB.*
only matches strings that contain the substring SPB
.
+.*
metacharacter matches any amount of non-negative characters. Newlines do not count.com
or edu
domain, where all characters of the email are letters.
+.
must precede the domain name. Including a backslash \
before any metacharacter (in this case, the .
) tells RegEx to match that character exactly.Here are a few more convenient regular expressions.
+Operation | +Syntax Example | +Matches | +Doesn’t Match | +
---|---|---|---|
built in character class |
+\w+ \d+ \s+ |
+Fawef_03 231123 whitespace |
+this person 423 people non-whitespace |
+
character class negation : [^] (everything except the given characters) |
+[^a-z]+. | +PEPPERS3982 17211!↑å | +porch CLAmS |
+
escape character : \ (match the literal next character) |
+cow\.com | +cow.com | +cowscom | +
beginning of line : ^ |
+^ark | +ark two ark o ark | +dark | +
end of line : $ |
+ark$ | +dark ark o ark |
+ark two | +
lazy version of zero or more : *? |
+5.*?5 | +5005 55 |
+5005005 | +
In order to fully understand the last operation in the table, we have to discuss greediness. RegEx is greedy – it will look for the longest possible match in a string. To motivate this with an example, consider the pattern <div>.*</div>
. In the sentence below, we would hope that the bolded portions would be matched:
“This is a <div>example</div> of greediness <div>in</div> regular expressions.”
+However, in reality, RegEx captures far more of the sentence. The way RegEx processes the text given that pattern is as follows:
+“Look for the exact string <>”
Then, “look for any character 0 or more times”
Then, “look for the exact string </div>”
The result would be all the characters starting from the leftmost <div> and the rightmost </div> (inclusive):
+“This is a <div>example</div> of greediness <div>in</div> regular expressions.”
+We can fix this by making our pattern non-greedy, <div>.*?</div>
. You can read up more in the documentation here.
Let’s revisit our earlier problem of extracting date/time data from the given .txt
files. Here is how the data looked.
0] log_lines[
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
+Question: Give a regular expression that matches everything contained within and including the brackets - the day, month, year, hour, minutes, seconds, and time zone.
+Answer: \[.*\]
[
and ]
is necessary. Therefore, an escape character \
is required before both [
and ]
— otherwise these metacharacters will match character classes.[
and ]
. For this example, .*
will suffice.Alternative Solution: \[\w+/\w+/\w+:\w+:\w+:\w+\s-\w+\]
[
and ]
was garbage - .*
will still match that.Earlier in this note, we examined the process of canonicalization using python
string manipulation and pandas
Series
methods. However, we mentioned this approach had a major flaw: our code was unnecessarily verbose. Equipped with our knowledge of regular expressions, let’s fix this.
To do so, we need to understand a few functions in the re
module. The first of these is the substitute function: re.sub(pattern, rep1, text)
. It behaves similarly to python
’s built-in .replace
function, and returns text with all instances of pattern
replaced by rep1
.
The regular expression here removes text surrounded by <>
(also known as HTML tags).
In order, the pattern matches … 1. a single <
2. any character that is not a >
: div, td valign…, /td, /div 3. a single >
Any substring in text
that fulfills all three conditions will be replaced by ''
.
import re
+
+= "<div><td valign='top'>Moo</td></div>"
+ text = r"<[^>]+>"
+ pattern '', text) re.sub(pattern,
'Moo'
+Notice the r
preceding the regular expression pattern; this specifies the regular expression is a raw string. Raw strings do not recognize escape sequences (i.e., the Python newline metacharacter \n
). This makes them useful for regular expressions, which often contain literal \
characters.
In other words, don’t forget to tag your RegEx with an r
.
pandas
We can also use regular expressions with pandas
Series
methods. This gives us the benefit of operating on an entire column of data as opposed to a single value. The code is simple:
ser.str.replace(pattern, repl, regex=True
).
Consider the following DataFrame
html_data
with a single column.
= {"HTML": ["<div><td valign='top'>Moo</td></div>", \
+ data "<a href='http://ds100.org'>Link</a>", \
+ "<b>Bold text</b>"]}
+ = pd.DataFrame(data) html_data
html_data
+ | HTML | +
---|---|
0 | +<div><td valign='top'>Moo</td></div> | +
1 | +<a href='http://ds100.org'>Link</a> | +
2 | +<b>Bold text</b> | +
= r"<[^>]+>"
+ pattern 'HTML'].str.replace(pattern, '', regex=True) html_data[
0 Moo
+1 Link
+2 Bold text
+Name: HTML, dtype: object
+Just like with canonicalization, the re
module provides capability to extract relevant text from a string:
re.findall(pattern, text)
. This function returns a list of all matches to pattern
.
Using the familiar regular expression for Social Security Numbers:
+= "My social security number is 123-45-6789 bro, or maybe it’s 321-45-6789."
+ text = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
+ pattern re.findall(pattern, text)
['123-45-6789', '321-45-6789']
+pandas
pandas
similarily provides extraction functionality on a Series
of data: ser.str.findall(pattern)
Consider the following DataFrame
ssn_data
.
= {"SSN": ["987-65-4321", "forty", \
+ data "123-45-6789 bro or 321-45-6789",
+ "999-99-9999"]}
+ = pd.DataFrame(data) ssn_data
ssn_data
+ | SSN | +
---|---|
0 | +987-65-4321 | +
1 | +forty | +
2 | +123-45-6789 bro or 321-45-6789 | +
3 | +999-99-9999 | +
"SSN"].str.findall(pattern) ssn_data[
0 [987-65-4321]
+1 []
+2 [123-45-6789, 321-45-6789]
+3 [999-99-9999]
+Name: SSN, dtype: object
+This function returns a list for every row containing the pattern matches in a given string.
+As you may expect, there are similar pandas
equivalents for other re
functions as well. Series.str.extract
takes in a pattern and returns a DataFrame
of each capture group’s first match in the string. In contrast, Series.str.extractall
returns a multi-indexed DataFrame
of all matches for each capture group. You can see the difference in the outputs below:
= r"([0-9]{3})-([0-9]{2})-([0-9]{4})"
+ pattern_cg "SSN"].str.extract(pattern_cg) ssn_data[
+ | 0 | +1 | +2 | +
---|---|---|---|
0 | +987 | +65 | +4321 | +
1 | +NaN | +NaN | +NaN | +
2 | +123 | +45 | +6789 | +
3 | +999 | +99 | +9999 | +
"SSN"].str.extractall(pattern_cg) ssn_data[
+ | + | 0 | +1 | +2 | +
---|---|---|---|---|
+ | match | ++ | + | + |
0 | +0 | +987 | +65 | +4321 | +
2 | +0 | +123 | +45 | +6789 | +
1 | +321 | +45 | +6789 | +|
3 | +0 | +999 | +99 | +9999 | +
Earlier we used parentheses (
)
to specify the highest order of operation in regular expressions. However, they have another meaning; parentheses are often used to represent capture groups. Capture groups are essentially, a set of smaller regular expressions that match multiple substrings in text data.
Let’s take a look at an example.
+= "Observations: 03:04:53 - Horse awakens. \
+ text 03:05:14 - Horse goes back to sleep."
Say we want to capture all occurences of time data (hour, minute, and second) as separate entities.
+= r"(\d\d):(\d\d):(\d\d)"
+ pattern_1 re.findall(pattern_1, text)
[('03', '04', '53'), ('03', '05', '14')]
+Notice how the given pattern has 3 capture groups, each specified by the regular expression (\d\d)
. We then use re.findall
to return these capture groups, each as tuples containing 3 matches.
These regular expression capture groups can be different. We can use the (\d{2})
shorthand to extract the same data.
= r"(\d\d):(\d\d):(\d{2})"
+ pattern_2 re.findall(pattern_2, text)
[('03', '04', '53'), ('03', '05', '14')]
+With the notion of capture groups, convince yourself how the following regular expression works.
+= log_lines[0]
+ first first
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
+= r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
+ pattern = re.findall(pattern, first)[0]
+ day, month, year, hour, minute, second, time_zone print(day, month, year, hour, minute, second, time_zone)
26 Jan 2014 10 47 58 -0800
+Today, we explored the capabilities of regular expressions in data wrangling with text data. However, there are a few things to be wary of.
+Writing regular expressions is like writing a program.
+Regular expressions are terrible at certain types of problems:
+json.load()
parser, not RegEx!Ultimately, the goal is not to memorize all regular expressions. Rather, the aim is to:
+python
and pandas
RegEx methods.In data science, understanding characteristics of a population starts with having quality data to investigate. While it is often impossible to collect all the data describing a population, we can overcome this by properly sampling from the population. In this note, we will discuss appropriate techniques for sampling from populations.
+In general: a census is “a complete count or survey of a population, typically recording various details of individuals.” An example is the U.S. Decennial Census which was held in April 2020. It counts every person living in all 50 states, DC, and US territories, not just citizens. Participation is required by law (it is mandated by the U.S. Constitution). Important uses include the allocation of Federal funds, congressional representation, and drawing congressional and state legislative districts. The census is composed of a survey mailed to different housing addresses in the United States.
+A survey is a set of questions. An example is workers sampling individuals and households. What is asked and how it is asked can affect how the respondent answers or even whether or not they answer in the first place.
+While censuses are great, it is often very difficult and expensive to survey everyone in a population. Imagine the amount of resources, money, time, and energy the U.S. spent on the 2020 Census. While this does give us more accurate information about the population, it’s often infeasible to execute. Thus, we usually survey a subset of the population instead.
+A sample is (usually) a subset of the population that is often used to make inferences about the population. If our sample is a good representation of our population, then we can use it to glean useful information at a lower cost. That being said, how the sample is drawn will affect the reliability of such inferences. Two common sources of error in sampling are chance error, where random samples can vary from what is expected in any direction, and bias, which is a systematic error in one direction. Biases can be the result of many things, for example, our sampling scheme or survey methods.
+Let’s define some useful vocabulary:
+While ideally, these three sets would be exactly the same, they usually aren’t in practice. For example, there may be individuals in your sampling frame (and hence, your sample) that are not in your population. And generally, sample sizes are much smaller than population sizes.
+The following case study is adapted from Statistics by Freedman, Pisani, and Purves, W.W. Norton NY, 1978.
+In 1936, President Franklin D. Roosevelt (Democratic) went up for re-election against Alf Landon (Republican). As is usual, polls were conducted in the months leading up to the election to try and predict the outcome. The Literary Digest was a magazine that had successfully predicted the outcome of 5 general elections coming into 1936. In their polling for the 1936 election, they sent out their survey to 10 million individuals whom they found from phone books, lists of magazine subscribers, and lists of country club members. Of the roughly 2.4 million people who filled out the survey, only 43% reported they would vote for Roosevelt; thus, the Digest predicted that Landon would win.
+On election day, Roosevelt won in a landslide, winning 61% of the popular vote of about 45 million voters. How could the Digest have been so wrong with their polling?
+It turns out that the Literary Digest sample was not representative of the population. Their sampling frame of people found in phone books, lists of magazine subscribers, and lists of country club members were more affluent and tended to vote Republican. As such, their sampling frame was inherently skewed in Landon’s favor. The Literary Digest completely overlooked the lion’s share of voters who were still suffering through the Great Depression. Furthermore, they had a dismal response rate (about 24%); who knows how the other non-respondents would have polled? The Digest folded just 18 months after this disaster.
+At the same time, George Gallup, a rising statistician, also made predictions about the 1936 elections. Despite having a smaller sample size of “only” 50,000 (this is still more than necessary; more when we cover the Central Limit Theorem), his estimate that 56% of voters would choose Roosevelt was much closer to the actual result (61%). Gallup also predicted the Digest’s prediction within 1% with a sample size of only 3000 people by anticipating the Digest’s affluent sampling frame and subsampling those individuals.
+So what’s the moral of the story? Samples, while convenient, are subject to chance error and bias. Election polling, in particular, can involve many sources of bias. To name a few:
+Randomized Response
+Suppose you want to ask someone a sensitive question: “Have you ever cheated on an exam?” An individual may be embarrassed or afraid to answer truthfully and might lie or not answer the question. One solution is to leverage a randomized response:
+First, you can ask the individual to secretly flip a fair coin; you (the surveyor) don’t know the outcome of the coin flip.
+Then, you ask them to answer “Yes” if the coin landed heads and to answer truthfully if the coin landed tails.
+The surveyor doesn’t know if the “Yes” means that the person cheated or if it means that the coin landed heads. The individual’s sensitive information remains secret. However, if the response is “No”, then the surveyor knows the individual didn’t cheat. We assume the individual is comfortable revealing this information.
+Generally, we can assume that the coin lands heads 50% of the time, masking the remaining 50% of the “No” answers. We can therefore double the proportion of “No” answers to estimate the true fraction of “No” answers.
+Election Polls
+Today, the Gallup Poll is one of the leading polls for election results. The many sources of biases – who responds to polls? Do voters tell the truth? How can we predict turnout? – still remain, but the Gallup Poll uses several tactics to mitigate them. Within their sampling frame of “civilian, non-institutionalized population” of adults in telephone households in continental U.S., they use random digit dialing to include both listed/unlisted phone numbers and to avoid selection bias. Additionally, they use a within-household selection process to randomly select households with one or more adults. If no one answers, re-call multiple times to avoid non-response bias.
+When sampling, it is essential to focus on the quality of the sample rather than the quantity of the sample. A huge sample size does not fix a bad sampling method. Our main goal is to gather a sample that is representative of the population it came from. In this section, we’ll explore the different types of sampling and their pros and cons.
+A convenience sample is whatever you can get ahold of; this type of sampling is non-random. Note that haphazard sampling is not necessarily random sampling; there are many potential sources of bias.
+In a probability sample, we provide the chance that any specified set of individuals will be in the sample (individuals in the population can have different chances of being selected; they don’t all have to be uniform), and we sample at random based off this known chance. For this reason, probability samples are also called random samples. The randomness provides a few benefits:
+The real world is usually more complicated, and we often don’t know the initial probabilities. For example, we do not generally know the probability that a given bacterium is in a microbiome sample or whether people will answer when Gallup calls landlines. That being said, still we try to model probability sampling to the best of our ability even when the sampling or measurement process is not fully under our control.
+A few common random sampling schemes:
+Suppose we have 3 TA’s (Arman, Boyu, Charlie): I decide to sample 2 of them as follows:
+We can list all the possible outcomes and their respective probabilities in a table:
+Outcome | +Probability | +
---|---|
{A, B} | +0.5 | +
{A, C} | +0.5 | +
{B, C} | +0 | +
This is a probability sample (though not a great one). Of the 3 people in my population, I know the chance of getting each subset. Suppose I’m measuring the average distance TAs live from campus.
+Consider the following sampling scheme:
+Yes. For a sample [n, n + 10, n + 20, …, n + 1090], where 1 <= n <= 10, the probability of that sample is 1/10. Otherwise, the probability is 0.
+Only 10 possible samples! +We are trying to collect a sample from Berkeley residents to predict the which one of Barbie and Oppenheimer would perform better on their opening day, July 21st.
+First, let’s grab a dataset that has every single resident in Berkeley (this is a fake dataset) and which movie they actually watched on July 21st.
+Let’s load in the movie.csv
table. We can assume that:
is_male
is a boolean that indicates if a resident identifies as male.import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+='darkgrid', font_scale = 1.5,
+ sns.set_theme(style={'figure.figsize':(7,5)})
+ rc
+= np.random.default_rng() rng
= pd.read_csv("data/movie.csv")
+ movie
+# create a 1/0 int that indicates Barbie vote
+'barbie'] = (movie['movie'] == 'Barbie').astype(int)
+ movie[ movie.head()
+ | age | +is_male | +movie | +barbie | +
---|---|---|---|---|
0 | +35 | +False | +Barbie | +1 | +
1 | +42 | +True | +Oppenheimer | +0 | +
2 | +55 | +False | +Barbie | +1 | +
3 | +77 | +True | +Oppenheimer | +0 | +
4 | +31 | +False | +Barbie | +1 | +
What fraction of Berkeley residents chose Barbie?
+= np.mean(movie["barbie"])
+ actual_barbie actual_barbie
np.float64(0.5302792307692308)
+This is the actual outcome of the competition. Based on this result, Barbie would win. How did our sample of retirees do?
+Let’s take a convenience sample of people who have retired (>= 65 years old). What proportion of them went to see Barbie instead of Oppenheimer?
+= movie[movie['age'] >= 65] # take a convenience sample of retirees
+ convenience_sample "barbie"]) # what proportion of them saw Barbie? np.mean(convenience_sample[
np.float64(0.3744755089093924)
+Based on this result, we would have predicted that Oppenheimer would win! What happened? Is it possible that our sample is too small or noisy?
+# what's the size of our sample?
+len(convenience_sample)
359396
+# what proportion of our data is in the convenience sample?
+len(convenience_sample)/len(movie)
0.27645846153846154
+Seems like our sample is rather large (roughly 360,000 people), so the error is likely not due to solely to chance.
+Let us aggregate all choices by age and visualize the fraction of Barbie views, split by gender.
+= movie.groupby(["age","is_male"]).agg("mean", numeric_only=True).reset_index()
+ votes_by_barbie votes_by_barbie.head()
+ | age | +is_male | +barbie | +
---|---|---|---|
0 | +18 | +False | +0.819594 | +
1 | +18 | +True | +0.667001 | +
2 | +19 | +False | +0.812214 | +
3 | +19 | +True | +0.661252 | +
4 | +20 | +False | +0.805281 | +
# A common matplotlib/seaborn pattern: create the figure and axes object, pass ax
+# to seaborn for drawing into, and later fine-tune the figure via ax.
+= plt.subplots();
+ fig, ax
+= ["#bf1518", "#397eb7"]
+ red_blue with sns.color_palette(red_blue):
+=votes_by_barbie, x = "age", y = "barbie", hue = "is_male", ax=ax)
+ sns.pointplot(data
+= [i.get_text() for i in ax.get_xticklabels()]
+ new_ticks range(0, len(new_ticks), 10), new_ticks[::10])
+ ax.set_xticks("Preferences by Demographics"); ax.set_title(
Suppose we took a simple random sample (SRS) of the same size as our retiree sample:
+= len(convenience_sample)
+ n = movie.sample(n, replace = False) ## By default, replace = False
+ random_sample "barbie"]) np.mean(random_sample[
np.float64(0.5306514262818729)
+This is very close to the actual vote of 0.5302792307692308!
+It turns out that we can get similar results with a much smaller sample size, say, 800:
+= 800
+ n = movie.sample(n, replace = False)
+ random_sample
+# Compute the sample average and the resulting relative error
+= np.mean(random_sample["barbie"])
+ sample_barbie = abs(sample_barbie-actual_barbie)/actual_barbie
+ err
+# We can print output with Markdown formatting too...
+from IPython.display import Markdown
+f"**Actual** = {actual_barbie:.4f}, **Sample** = {sample_barbie:.4f}, "
+ Markdown(f"**Err** = {100*err:.2f}%.")
Actual = 0.5303, Sample = 0.5300, Err = 0.05%.
+We’ll learn how to choose this number when we (re)learn the Central Limit Theorem later in the semester.
+In our SRS of size 800, what would be our chance error?
+Let’s simulate 1000 versions of taking the 800-sized SRS from before:
+= 1000 # number of simulations
+ nrep = 800 # size of our sample
+ n = []
+ poll_result for i in range(0, nrep):
+= movie.sample(n, replace = False)
+ random_sample "barbie"])) poll_result.append(np.mean(random_sample[
= plt.subplots()
+ fig, ax ='density', ax=ax)
+ sns.histplot(poll_result, stat="orange", lw=4); ax.axvline(actual_barbie, color
What fraction of these simulated samples would have predicted Barbie?
+= pd.Series(poll_result)
+ poll_result sum(poll_result > 0.5)/1000 np.
np.float64(0.949)
+You can see the curve looks roughly Gaussian/normal. Using KDE:
+='density', kde=True); sns.histplot(poll_result, stat
Understanding the sampling process is what lets us go from describing the data to understanding the world. Without knowing / assuming something about how the data were collected, there is no connection between the sample and the population. Ultimately, the dataset doesn’t tell us about the world behind the data.
+ + +So far in the course, we have made our way through the entire data science lifecycle: we learned how to load and explore a dataset, formulate questions, and use the tools of prediction and inference to come up with answers. For the remaining weeks of the semester, we are going to make a second pass through the lifecycle, this time with a different set of tools, ideas, and abstractions.
+With this goal in mind, let’s go back to the very beginning of the lifecycle. We first started our work in data analysis by looking at the pandas
library, which offered us powerful tools to manipulate tabular data stored in (primarily) CSV files. CSVs work well when analyzing relatively small datasets (less than 10GB) that don’t need to be shared across many users. In research and industry, however, data scientists often need to access enormous bodies of data that cannot be easily stored in a CSV format. Collaborating with others when working with CSVs can also be tricky —— a real-world data scientist may run into problems when multiple users try to make modifications or more dire security issues arise regarding who should and should not have access to the data.
A database is a large, organized collection of data. Databases are administered by Database Management Systems (DBMS), which are software systems that store, manage, and facilitate access to one or more databases. Databases help mitigate many of the issues that come with using CSVs for data storage: they provide reliable storage that can survive system crashes or disk failures, are optimized to compute on data that does not fit into memory, and contain special data structures to improve performance. Using databases rather than CSVs offers further benefits from the standpoint of data management. A DBMS can apply settings that configure how data is organized, block certain data anomalies (for example, enforcing non-negative weights or ages), and determine who is allowed access to the data. It can also ensure safe concurrent operations where multiple users reading and writing to the database will not lead to fatal errors. Below, you can see the functionality of the different types of data storage and management architectures. In data science, common large-scale DBMS systems used are Google BigQuery, Amazon Redshift, Snowflake, Databricks, Microsoft SQL Server, and more. To learn more about these, consider taking Data 101!
+
+
+
As you may have guessed, we can’t use our usual pandas
methods to work with data in a database. Instead, we’ll turn to Structured Query Language.
Structured Query Language, or SQL (commonly pronounced “sequel,” though this is the subject of fierce debate), is a special programming language designed to communicate with databases, and it is the dominant language/technology for working with data. You may have encountered it in classes like CS 61A or Data C88C before, and you likely will encounter it in the future. It is a language of tables: all inputs and outputs are tables. Unlike Python, it is a declarative programming language – this means that rather than writing the exact logic needed to complete a task, a piece of SQL code “declares” what the desired final output should be and leaves the program to determine what logic should be implemented. This logic differs depending on the SQL code itself or on the system it’s running on (ie. MongoDB, SQLite, DuckDB, etc.). Most systems don’t follow the standards, and every system you work with will be a little different.
+For the purposes of Data 100, we use SQLite or DuckDB. SQLite is an easy-to-use library that allows users to directly manipulate a database file or an in-memory database with a simplified version of SQL. It’s commonly used to store data for small apps on mobile devices and is optimized for simplicity and speed of simple data tasks. DuckDB is an easy-to-use library that lets you directly manipulate a database file, collection of table formatted files (e.g., CSV), or in-memory pandas
DataFrame
s using a more complete version of SQL. It’s optimized for simplicity and speed of advanced data analysis tasks and is becoming increasingly popular for data analysis tasks on large datasets.
It is important to reiterate that SQL is an entirely different language from Python. However, Python does have special engines that allow us to run SQL code in a Jupyter notebook. While this is typically not how SQL is used outside of an educational setting, we will use this workflow to illustrate how SQL queries are constructed using the tools we’ve already worked with this semester. You will learn more about how to run SQL queries in Jupyter in an upcoming lab and homework.
+The syntax below will seem unfamiliar to you; for now, just focus on understanding the output displayed. We will clarify the SQL code in a bit.
+To start, we’ll look at a database called example_duck.db
and connect to it using DuckDB.
# Load the SQL Alchemy Python library and DuckDB
+import sqlalchemy
+import duckdb
# Load %%sql cell magic
+%load_ext sql
# Connect to the database
+%sql duckdb:///data/example_duck.db --alias duck
Now that we’re connected, let’s make some queries!
+%%sql
+* FROM Dragon; SELECT
* duckdb:///data/example_duck.db
+Done.
+name | +year | +cute | +
---|
Thanks to the pandas
magic, the resulting return data is displayed in a format almost identical to our pandas
tables but without an index.
+
+
Looking at the Dragon
table above, we can see that it contains three columns. The first of these, "name"
, contains text data. The "year"
column contains integer data, with the constraint that year values must be greater than or equal to 2000. The final column, "cute"
, contains integer data with no restrictions on allowable values.
Now, let’s look at the schema of our database. A schema describes the logical structure of a table. Whenever a new table is created, the creator must declare its schema.
+%%sql
SELECT *
FROM sqlite_master
WHERE type='table'
* duckdb:///data/example_duck.db
+Done.
+type | +name | +tbl_name | +rootpage | +sql | +
---|
The summary above displays information about the database; it contains four tables named sqlite_sequence
, Dragon
, Dish
, and Scene
. The rightmost column above lists the command that was used to construct each table.
Let’s look more closely at the command used to create the Dragon
table (the second entry above).
CREATE TABLE Dragon (name TEXT PRIMARY KEY,
+ year INTEGER CHECK (year >= 2000),
+ cute INTEGER)
+The statement CREATE TABLE
is used to specify the schema of the table – a description of what logic is used to organize the table. Schema follows a set format:
ColName
: the name of a column
DataType
: the type of data to be stored in a column. Some of the most common SQL data types are:
INT
(integers)FLOAT
(floating point numbers)TEXT
(strings)BLOB
(arbitrary data, such as audio/video files)DATETIME
(a date and time)Constraint
: some restriction on the data to be stored in the column. Common constraints are:
CHECK
(data must obey a certain condition)PRIMARY KEY
(designate a column as the table’s primary key)NOT NULL
(data cannot be null)DEFAULT
(a default fill value if no specific entry is given)Note that different implementations of SQL (e.g., DuckDB, SQLite, MySQL) will support different types. In Data 100, we’ll primarily use DuckDB.
+Database tables (also referred to as relations) are structured much like DataFrame
s in pandas
. Each row, sometimes called a tuple, represents a single record in the dataset. Each column, sometimes called an attribute or field, describes some feature of the record.
The primary key is a set of column(s) that uniquely identify each record in the table. In the Dragon
table, the "name"
column is its primary key that uniquely identifies each entry in the table. Because "name"
is the primary key of the table, no two entries in the table can have the same name – a given value of "name"
is unique to each dragon. Primary keys are used to ensure data integrity and to optimize data access.
A foreign key is a column or set of columns that references a primary key in another table. A foreign key constraint ensures that a primary key exists in the referenced table. For example, let’s say we have 2 tables, student
and assignment
, with the following schemas:
CREATE TABLE student (
+ student_id INTEGER PRIMARY KEY,
+ name VARCHAR,
+ email VARCHAR
+);
+
+CREATE TABLE assignment (
+ assignment_id INTEGER PRIMARY KEY,
+ description VARCHAR
+);
+Note that each table has a primary key that uniquely identifies each student and assignment.
+Say we want to create the table grade
to store the score each student got on each assignment. Naturally, this will depend on the information in student
and assignment
; we should not be saving the grade for a nonexistent student nor a nonexistent assignment. Hence, we can create the columns student_id
and assignment_id
that reference foreign tables student
and assignment
, respectively. This way, we ensure that the data in grade
is always up-to-date with the other tables.
CREATE TABLE grade (
+ student_id INTEGER,
+ assignment_id INTEGER,
+ score REAL,
+ FOREIGN KEY (student_id) REFERENCES student(student_id),
+ FOREIGN KEY (assignment_id) REFERENCES assignment(assignment_id)
+);
+To extract and manipulate data stored in a SQL table, we will need to familiarize ourselves with the syntax to write pieces of SQL code, which we call queries.
+SELECT
ing From TablesThe basic unit of a SQL query is the SELECT
statement. SELECT
specifies what columns we would like to extract from a given table. We use FROM
to tell SQL the table from which we want to SELECT
our data.
%%sql
SELECT *
FROM Dragon;
* duckdb:///data/example_duck.db
+Done.
+name | +year | +cute | +
---|
In SQL, *
means “everything.” The query above grabs all the columns in Dragon
and displays them in the outputted table. We can also specify a specific subset of columns to be SELECT
ed. Notice that the outputted columns appear in the order they were SELECT
ed.
%%sql
SELECT cute, year
FROM Dragon;
* duckdb:///data/example_duck.db
+Done.
+cute | +year | +
---|
Every SQL query must include both a SELECT
and FROM
statement. Intuitively, this makes sense —— we know that we’ll want to extract some piece of information from the table; to do so, we also need to indicate what table we want to consider.
It is important to note that SQL enforces a strict “order of operations” —— SQL clauses must always follow the same sequence. For example, the SELECT
statement must always precede FROM
. This means that any SQL query will follow the same structure.
SELECT <column list>
+FROM <table>
+[additional clauses]
+The additional clauses we use depend on the specific task we’re trying to achieve. We may refine our query to filter on a certain condition, aggregate a particular column, or join several tables together. We will spend the rest of this note outlining some useful clauses to build up our understanding of the order of operations.
+And just like that, we’ve already written two SQL queries. There are a few things to note in the queries above. Firstly, notice that every “verb” is written in uppercase. It is convention to write SQL operations in capital letters, but your code will run just fine even if you choose to keep things in lowercase. Second, the query above separates each statement with a new line. SQL queries are not impacted by whitespace within the query; this means that SQL code is typically written with a new line after each statement to make things more readable. The semicolon (;
) indicates the end of a query. There are some “flavors” of SQL in which a query will not run if no semicolon is present; however, in Data 100, the SQL version we will use works with or without an ending semicolon. Queries in these notes will end with semicolons to build up good habits.
AS
The AS
keyword allows us to give a column a new name (called an alias) after it has been SELECT
ed. The general syntax is:
SELECT column_in_input_table AS new_name_in_output_table
+%%sql
SELECT cute AS cuteness, year AS birth
FROM Dragon;
* duckdb:///data/example_duck.db
+Done.
+cuteness | +birth | +
---|
DISTINCT
To SELECT
only the unique values in a column, we use the DISTINCT
keyword. This will cause any duplicate entries in a column to be removed. If we want to find only the unique years in Dragon
, without any repeats, we would write:
%%sql
SELECT DISTINCT year
FROM Dragon;
* duckdb:///data/example_duck.db
+Done.
+year | +
---|
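As a rough point of comparison (not one of the queries above), pandas offers .unique() and .drop_duplicates() for the same idea; here is a minimal sketch on a small invented DataFrame:

import pandas as pd

toy = pd.DataFrame({"year": [2010, 2011, 2011, 2019]})   # invented values for illustration
toy["year"].unique()      # distinct values of one column
toy.drop_duplicates()     # distinct rows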
WHERE
ConditionsThe WHERE
keyword is used to select only some rows of a table, filtered on a given Boolean condition.
%%sql
SELECT name, year
FROM Dragon
WHERE cute > 0;
* duckdb:///data/example_duck.db
+Done.
+name | +year | +
---|
We can add complexity to the WHERE
condition using the keywords AND
, OR
, and NOT
, much like we would in Python.
%%sql
SELECT name, year
FROM Dragon
WHERE cute > 0 OR year > 2013;
* duckdb:///data/example_duck.db
+Done.
+name | +year | +
---|
To spare ourselves needing to write complicated logical expressions by combining several conditions, we can also filter for entries that are IN
a specified list of values. This is similar to the use of in
or .isin
in Python.
%%sql
SELECT name, year
FROM Dragon
WHERE name IN ('hiccup', 'puff');
* duckdb:///data/example_duck.db
+Done.
+name | +year | +
---|
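For intuition, the same filters can be written with boolean indexing and .isin in pandas. Below is a minimal sketch on an invented stand-in for the Dragon table; the rows are made up for illustration and are not the real data.

import pandas as pd

dragon_df = pd.DataFrame({
    "name": ["hiccup", "drogon", "puff"],   # invented rows for illustration
    "year": [2010, 2011, 2015],
    "cute": [10, -100, 100],
})

dragon_df[dragon_df["cute"] > 0]                                   # WHERE cute > 0
dragon_df[(dragon_df["cute"] > 0) | (dragon_df["year"] > 2013)]    # WHERE cute > 0 OR year > 2013
dragon_df[dragon_df["name"].isin(["hiccup", "puff"])]              # WHERE name IN ('hiccup', 'puff')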
In Python
, there is no distinction between double ""
and single quotes ''
. SQL, on the other hand, distinguishes double quotes ""
as column names and single quotes ''
as strings. For example, we can make the call
SELECT "birth weight"
+FROM patient
+WHERE "first name" = 'Joey'
+to select the column "birth weight"
from the patient
table and only select rows where the column "first name"
is equal to 'Joey'
.
WHERE
WITH NULL
ValuesYou may have noticed earlier that our table actually has a missing value. In SQL, missing data is given the special value NULL
. NULL
behaves in a fundamentally different way to other data types. We can’t use the typical operators (=, >, and <) on NULL
values (in fact, NULL == NULL
returns False
!). Instead, we check to see if a value IS
or IS NOT
NULL
.
%%sql
SELECT name, cute
FROM Dragon
WHERE cute IS NOT NULL;
* duckdb:///data/example_duck.db
+Done.
+name | +cute | +
---|
ORDER BY
What if we want the output table to appear in a certain order? The ORDER BY
keyword behaves similarly to .sort_values()
in pandas
.
%%sql
SELECT *
FROM Dragon
ORDER BY cute;
* duckdb:///data/example_duck.db
+Done.
+name | +year | +cute | +
---|
By default, ORDER BY
will display results in ascending order (ASC
) with the lowest values at the top of the table. To sort in descending order, we use the DESC
keyword after specifying the column to be used for ordering.
%%sql
SELECT *
FROM Dragon
ORDER BY cute DESC;
* duckdb:///data/example_duck.db
+Done.
+name | +year | +cute | +
---|
We can also tell SQL to ORDER BY
two columns at once. This will sort the table by the first listed column, then use the values in the second listed column to break any ties.
%%sql
SELECT *
FROM Dragon
ORDER BY year, cute DESC;
* duckdb:///data/example_duck.db
+Done.
+name | +year | +cute | +
---|
Note that in this example, year
is sorted in ascending order and cute
in descending order. If you want year
to be ordered in descending order as well, you need to specify year DESC, cute DESC;
.
LIMIT
vs. OFFSET
In many instances, we are only concerned with a certain number of rows in the output table (for example, wanting to find the first two dragons in the table). The LIMIT
keyword restricts the output to a specified number of rows. It serves a function similar to that of .head()
in pandas
.
%%sql
SELECT *
FROM Dragon
LIMIT 2;
* duckdb:///data/example_duck.db
+Done.
+name | +year | +cute | +
---|
The OFFSET
keyword indicates the index at which LIMIT
should start. In other words, we can use OFFSET
to shift where the LIMIT
ing begins by a specified number of rows. For example, we might care about the dragons that are at positions 2 and 3 in the table.
%%sql
SELECT *
FROM Dragon
LIMIT 2
OFFSET 1;
* duckdb:///data/example_duck.db
+Done.
+name | +year | +cute | +
---|
With these keywords in hand, let’s update our SQL order of operations. Remember: every SQL query must list clauses in this order.
+SELECT <column expression list>
+FROM <table>
+[WHERE <predicate>]
+[ORDER BY <column list>]
+[LIMIT <number of rows>]
+[OFFSET <number of rows>];
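To see every optional clause in one place, here is a minimal sketch that runs a complete query with DuckDB's Python API. It relies on DuckDB's ability to query a pandas DataFrame that is in scope by name; the dragon_df values below are invented for illustration and are not the course's Dragon table.

import duckdb
import pandas as pd

dragon_df = pd.DataFrame({
    "name": ["hiccup", "drogon", "puff"],   # invented rows for illustration
    "year": [2010, 2011, 2015],
    "cute": [10, -100, 100],
})

# DuckDB can scan the in-scope DataFrame; the clauses appear in the required order
duckdb.sql("""
    SELECT name, year AS birth
    FROM dragon_df
    WHERE cute > 0
    ORDER BY year DESC
    LIMIT 1
    OFFSET 1;
""")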
+Let’s summarize what we’ve learned so far. We know that SELECT
and FROM
are the fundamental building blocks of any SQL query. We can augment these two keywords with additional clauses to refine the data in our output table.
Any clauses that we include must follow a strict ordering within the query:
+SELECT <column list>
+FROM <table>
+[WHERE <predicate>]
+[ORDER BY <column list>]
+[LIMIT <number of rows>]
+[OFFSET <number of rows>]
+Here, any clause contained in square brackets [ ]
is optional —— we only need to use the keyword if it is relevant to the table operation we want to perform. Also note that by convention, we use all caps for keywords in SQL statements and use newlines to make code more readable.
In this lecture, we’ll continue our work from last time to introduce some advanced SQL syntax.
+First, let’s load in the basic_examples.db
database.
# Load the SQL Alchemy Python library and DuckDB
+import sqlalchemy
+import duckdb
# Load %%sql cell magic
+%load_ext sql
# Connect to the database
+%sql duckdb:///data/basic_examples.db --alias basic
GROUP BY
At this point, we’ve seen that SQL offers much of the same functionality that was given to us by pandas
. We can extract data from a table, filter it, and reorder it to suit our needs.
In pandas
, much of our analysis work relied heavily on being able to use .groupby()
to aggregate across the rows of our dataset. SQL’s answer to this task is the (very conveniently named) GROUP BY
clause. While the outputs of GROUP BY
are similar to those of .groupby()
—— in both cases, we obtain an output table where some column has been used for grouping —— the syntax and logic used to group data in SQL are fairly different to the pandas
implementation.
To illustrate GROUP BY
, we will consider the Dish
table from our database.
%%sql
SELECT *
FROM Dish;
* duckdb:///data/basic_examples.db
+Done.
+name | +type | +cost | +
---|
Notice that there are multiple dishes of the same type
. What if we wanted to find the total costs of dishes of a certain type
? To accomplish this, we would write the following code.
%%sql
SELECT type, SUM(cost)
FROM Dish
GROUP BY type;
* duckdb:///data/basic_examples.db
+Done.
+type | +sum("cost") | +
---|
What is going on here? The statement GROUP BY type
tells SQL to group the data based on the value contained in the type
column (whether a record is an appetizer, entree, or dessert). SUM(cost)
sums up the costs of dishes in each type
and displays the result in the output table.
You may be wondering: why does SUM(cost)
come before the command to GROUP BY type
? Don’t we need to form groups before we can count the number of entries in each? Remember that SQL is a declarative programming language —— a SQL programmer simply states what end result they would like to see, and leaves the task of figuring out how to obtain this result to SQL itself. This means that SQL queries sometimes don’t follow what a reader sees as a “logical” sequence of thought. Instead, SQL requires that we follow its set order of operations when constructing queries. So long as we follow this order, SQL will handle the underlying logic.
In practical terms: our goal with this query was to output the total cost
s of each type
. To communicate this to SQL, we say that we want to SELECT
the SUM
med cost
values for each type
group.
There are many aggregation functions that can be used to aggregate the data contained in each group. Some common examples are:
+COUNT
: count the number of rows associated with each groupMIN
: find the minimum value of each groupMAX
: find the maximum value of each groupSUM
: sum across all records in each groupAVG
: find the average value of each groupWe can easily compute multiple aggregations all at once (a task that was very tricky in pandas
).
%%sql
SELECT type, SUM(cost), MIN(cost), MAX(name)
FROM Dish
GROUP BY type;
* duckdb:///data/basic_examples.db
+Done.
+type | +sum("cost") | +min("cost") | +max("name") | +
---|
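For comparison, a rough pandas counterpart of the query above uses .groupby() with .agg(). The dish_df rows below are invented stand-ins for the Dish table, used only to make the sketch runnable.

import pandas as pd

dish_df = pd.DataFrame({
    "name": ["ravioli", "pork bun", "taco", "mochi"],   # invented rows for illustration
    "type": ["entree", "entree", "entree", "dessert"],
    "cost": [9, 7, 7, 3],
})

dish_df.groupby("type").agg({"cost": ["sum", "min"], "name": "max"})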
To count the number of rows associated with each group, we use the COUNT
keyword. Calling COUNT(*)
will compute the total number of rows in each group, including rows with null values. Its pandas
equivalent is .groupby().size()
.
Recall the Dragon
table from the previous lecture:
%%sql
SELECT * FROM Dragon;
* duckdb:///data/basic_examples.db
+Done.
+name | +year | +cute | +
---|
Notice that COUNT(*)
and COUNT(cute)
result in different outputs.
%%sql
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;
* duckdb:///data/basic_examples.db
+Done.
+year | +count_star() | +
---|
%%sql
SELECT year, COUNT(cute)
FROM Dragon
GROUP BY year;
* duckdb:///data/basic_examples.db
+Done.
+year | +count(cute) | +
---|
With this definition of GROUP BY
in hand, let’s update our SQL order of operations. Remember: every SQL query must list clauses in this order.
SELECT <column expression list>
+FROM <table>
+[WHERE <predicate>]
+[GROUP BY <column list>]
+[ORDER BY <column list>]
+[LIMIT <number of rows>]
+[OFFSET <number of rows>];
+Note that we can use the AS
keyword to rename columns during the selection process and that column expressions may include aggregation functions (MAX
, MIN
, etc.).
Now, what if we only want groups that meet a certain condition? HAVING
filters groups by applying some condition across all rows in each group. We interpret it as a way to keep only the groups HAVING
some condition. Note the difference between WHERE
and HAVING
: we use WHERE
to filter rows, whereas we use HAVING
to filter groups. WHERE
precedes HAVING
in terms of how SQL executes a query.
Let’s take a look at the Dish
table to see how we can use HAVING
. Say we want to group dishes with a cost greater than 4 by type
and only keep groups where the max cost is less than 10.
%%sql
SELECT type, COUNT(*)
FROM Dish
WHERE cost > 4
GROUP BY type
HAVING MAX(cost) < 10;
* duckdb:///data/basic_examples.db
+Done.
+type | +count_star() | +
---|
Here, we first use WHERE
to filter for rows with a cost greater than 4. We then group our values by type
before applying the HAVING
operator. With HAVING
, we can filter our groups based on if the max cost is less than 10.
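One way to see the row-filter versus group-filter distinction is a rough pandas sketch on invented Dish-like data: the WHERE step is boolean indexing, and the HAVING step is .filter() on the grouped data. This is only an analogy, not the course's own example.

import pandas as pd

dish_df = pd.DataFrame({
    "name": ["ravioli", "pork bun", "taco", "mochi"],   # invented rows for illustration
    "type": ["entree", "entree", "entree", "dessert"],
    "cost": [9, 7, 7, 3],
})

subset = dish_df[dish_df["cost"] > 4]                                 # WHERE cost > 4
kept = subset.groupby("type").filter(lambda g: g["cost"].max() < 10)  # HAVING MAX(cost) < 10
kept.groupby("type").size()                                           # COUNT(*) per remaining group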
With this definition of GROUP BY
and HAVING
in hand, let’s update our SQL order of operations. Remember: every SQL query must list clauses in this order.
SELECT <column expression list>
+FROM <table>
+[WHERE <predicate>]
+[GROUP BY <column list>]
+[ORDER BY <column list>]
+[LIMIT <number of rows>]
+[OFFSET <number of rows>];
+Note that we can use the AS
keyword to rename columns during the selection process and that column expressions may include aggregation functions (MAX
, MIN
, etc.).
In the last lecture, we mostly worked under the assumption that our data had already been cleaned. However, as we saw in our first pass through the data science lifecycle, we’re very unlikely to be given data that is free of formatting issues. With this in mind, we’ll want to learn how to clean and transform data in SQL.
Our typical workflow when working with "big data" is:
First, use SQL to query the database and obtain a smaller, more manageable subset of the data. Then, use a Python library (such as pandas
) to analyze this data in detailWe can, however, still perform simple data cleaning and re-structuring using SQL directly. To do so, we’ll use the Title
table from the imdb_duck
database, which contains information about movies and actors.
Let’s load in the imdb_duck
database.
import os
os.environ["TQDM_DISABLE"] = "1"
if os.path.exists("/home/jovyan/shared/sql/imdb_duck.db"):
    imdbpath = "duckdb:////home/jovyan/shared/sql/imdb_duck.db"
elif os.path.exists("data/imdb_duck.db"):
    imdbpath = "duckdb:///data/imdb_duck.db"
else:
    import gdown
    url = 'https://drive.google.com/uc?id=10tKOHGLt9QoOgq5Ii-FhxpB9lDSQgl1O'
    output_path = 'data/imdb_duck.db'
    gdown.download(url, output_path, quiet=False)
    imdbpath = "duckdb:///data/imdb_duck.db"
from sqlalchemy import create_engine
imdb_engine = create_engine(imdbpath, connect_args={'read_only': True})
%sql imdb_engine --alias imdb
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.ParserException) Parser Error: syntax error at or near "imdb_engine"
+[SQL: imdb_engine]
+(Background on this error at: https://sqlalche.me/e/20/f405)
+Since we’ll be working with the Title
table, let’s take a quick look at what it contains.
%%sql imdb
SELECT *
FROM Title
WHERE primaryTitle IN ('Ginny & Georgia', 'What If...?', 'Succession', 'Veep', 'Tenet')
LIMIT 10;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.ParserException) Parser Error: syntax error at or near "imdb"
+[SQL: imdb
+
+SELECT *
+FROM Title
+WHERE primaryTitle IN ('Ginny & Georgia', 'What If...?', 'Succession', 'Veep', 'Tenet')
+LIMIT 10;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
+LIKE
One common task we encountered in our first look at EDA was needing to match string data. For example, we might want to remove entries beginning with the same prefix as part of the data cleaning process.
+In SQL, we use the LIKE
operator to (you guessed it) look for strings that are like a given string pattern.
%%sql
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle LIKE 'Star Wars: Episode I - The Phantom Menace'
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 2: FROM Title
+ ^
+[SQL: SELECT titleType, primaryTitle
+FROM Title
+WHERE primaryTitle LIKE 'Star Wars: Episode I - The Phantom Menace']
+(Background on this error at: https://sqlalche.me/e/20/f405)
+What if we wanted to find all Star Wars movies? %
is the wildcard operator; it means "look for any character, any number of times." This makes it helpful for identifying strings that are similar to our desired pattern, even when we don't know the full text of what we aim to extract.
%%sql
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle LIKE '%Star Wars%'
LIMIT 10;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 2: FROM Title
+ ^
+[SQL: SELECT titleType, primaryTitle
+FROM Title
+WHERE primaryTitle LIKE '%Star Wars%'
+LIMIT 10;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
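A loose pandas analogue of this kind of substring match is .str.contains(); the titles Series below is invented for illustration only.

import pandas as pd

titles = pd.Series(["Star Wars: Episode I - The Phantom Menace", "Tenet", "Solo: A Star Wars Story"])
titles[titles.str.contains("Star Wars")]   # roughly: WHERE primaryTitle LIKE '%Star Wars%'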
+Alternatively, we can use RegEx! DuckDB and most real DBMSs allow for this. Note that here, we have to use the SIMILAR TO
operator rather than LIKE
.
%%sql
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle SIMILAR TO '.*Star Wars*.'
LIMIT 10;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 2: FROM Title
+ ^
+[SQL: SELECT titleType, primaryTitle
+FROM Title
+WHERE primaryTitle SIMILAR TO '.*Star Wars*.'
+LIMIT 10;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
+CAST
ing Data TypesA common data cleaning task is converting data to the correct variable type. The CAST
keyword is used to generate a new output column. Each entry in this output column is the result of converting the data in an existing column to a new data type. For example, we may wish to convert numeric data stored as a string to an integer.
%%sql
SELECT primaryTitle, CAST(runtimeMinutes AS INT)
FROM Title;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 2: FROM Title;
+ ^
+[SQL: SELECT primaryTitle, CAST(runtimeMinutes AS INT)
+FROM Title;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
+We use CAST
when SELECT
ing columns for our output table. In the example above, we want to SELECT
the columns of integer year and runtime data that is created by the CAST
.
SQL will automatically name a new column according to the command used to SELECT
it, which can lead to unwieldy column names. We can rename the CAST
ed column using the AS
keyword.
%%sql
SELECT primaryTitle AS title, CAST(runtimeMinutes AS INT) AS minutes, CAST(startYear AS INT) AS year
FROM Title
LIMIT 5;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 2: FROM Title
+ ^
+[SQL: SELECT primaryTitle AS title, CAST(runtimeMinutes AS INT) AS minutes, CAST(startYear AS INT) AS year
+FROM Title
+LIMIT 5;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
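For intuition, the pandas counterpart of CAST ... AS ... is converting a column's dtype and renaming it. The single-row title_df below is an invented stand-in for the Title table, used only so the sketch runs on its own.

import pandas as pd

title_df = pd.DataFrame({"primaryTitle": ["Tenet"], "runtimeMinutes": ["150"], "startYear": ["2020"]})

out = pd.DataFrame({
    "title": title_df["primaryTitle"],
    "minutes": title_df["runtimeMinutes"].astype(int),   # CAST(runtimeMinutes AS INT) AS minutes
    "year": title_df["startYear"].astype(int),           # CAST(startYear AS INT) AS year
})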
+CASE
When working with pandas
, we often ran into situations where we wanted to generate new columns using some form of conditional statement. For example, say we wanted to describe a film title as “old,” “mid-aged,” or “new,” depending on the year of its release.
In SQL, conditional operations are performed using a CASE
clause. Conceptually, CASE
behaves much like the CAST
operation: it creates a new column that we can then SELECT
to appear in the output. The syntax for a CASE
clause is as follows:
CASE WHEN <condition> THEN <value>
+ WHEN <other condition> THEN <other value>
+ ...
+ ELSE <yet another value>
+ END
+Scanning through the skeleton code above, you can see that the logic is similar to that of an if
statement in Python. The conditional statement is first opened by calling CASE
. Each new condition is specified by WHEN
, with THEN
indicating what value should be filled if the condition is met. ELSE
specifies the value that should be filled if no other conditions are met. Lastly, END
indicates the end of the conditional statement; once END
has been called, SQL will continue evaluating the query as usual.
Let’s see this in action. In the example below, we give the new column created by the CASE
statement the name movie_age
.
%%sql
/* If a movie was filmed before 1950, it is "old"
Otherwise, if a movie was filmed before 2000, it is "mid-aged"
Else, a movie is "new" */

SELECT titleType, startYear,
CASE WHEN startYear < 1950 THEN 'old'
     WHEN startYear < 2000 THEN 'mid-aged'
     ELSE 'new'
     END AS movie_age
FROM Title;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 10: FROM Title;
+ ^
+[SQL: /* If a movie was filmed before 1950, it is "old"
+Otherwise, if a movie was filmed before 2000, it is "mid-aged"
+Else, a movie is "new" */
+
+SELECT titleType, startYear,
+CASE WHEN startYear < 1950 THEN 'old'
+ WHEN startYear < 2000 THEN 'mid-aged'
+ ELSE 'new'
+ END AS movie_age
+FROM Title;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
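In pandas, this kind of conditional column is often built with numpy.select; the startYear values below are invented so the sketch is self-contained.

import numpy as np
import pandas as pd

movies_df = pd.DataFrame({"startYear": [1940, 1985, 2012]})   # invented values for illustration
movies_df["movie_age"] = np.select(
    [movies_df["startYear"] < 1950, movies_df["startYear"] < 2000],   # WHEN ... THEN ...
    ["old", "mid-aged"],
    default="new",                                                     # ELSE 'new'
)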
+JOIN
ing TablesAt this point, we’re well-versed in using SQL as a tool to clean, manipulate, and transform data in a table. Notice that this sentence referred to one table, specifically. What happens if the data we need is distributed across multiple tables? This is an important consideration when using SQL —— recall that we first introduced SQL as a language to query from databases. Databases often store data in a multidimensional structure. In other words, information is stored across several tables, with each table containing a small subset of all the data housed by the database.
+A common way of organizing a database is by using a star schema. A star schema is composed of two types of tables. A fact table is the central table of the database —— it contains the information needed to link entries across several dimension tables, which contain more detailed information about the data.
+Say we were working with a database about boba offerings in Berkeley. The dimension tables of the database might contain information about tea varieties and boba toppings. The fact table would be used to link this information across the various dimension tables.
+If we explicitly mark the relationships between tables, we start to see the star-like structure of the star schema.
+To join data across multiple tables, we’ll use the (creatively named) JOIN
keyword. We’ll make things easier for now by first considering the simpler cats
dataset, which consists of the tables s
and t
.
To perform a join, we amend the FROM
clause. You can think of this as saying, “SELECT
my data FROM
tables that have been JOIN
ed together.”
Remember: SQL does not consider newlines or whitespace when interpreting queries. The indentation given in the example below is to help improve readability. If you wish, you can write code that does not follow this formatting.
+SELECT <column list>
+FROM table_1
+ JOIN table_2
+ ON key_1 = key_2;
+We also need to specify what column from each table should be used to determine matching entries. By defining these keys, we provide SQL with the information it needs to pair rows of data together.
+The most commonly used type of SQL JOIN
is the inner join. It turns out you’re already familiar with what an inner join does, and how it works – this is the type of join we’ve been using in pandas
all along! In an inner join, we combine every row in our first table with its matching entry in the second table. If a row from either table does not have a match in the other table, it is omitted from the output.
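Since this is the join we have been using in pandas all along, a minimal .merge() sketch makes the correspondence explicit. The s and t frames below are invented stand-ins with made-up column names, not the actual cats tables.

import pandas as pd

s = pd.DataFrame({"id": [0, 1, 2], "name": ["apricot", "boots", "cally"]})
t = pd.DataFrame({"id": [1, 2, 3], "breed": ["tabby", "tuxedo", "calico"]})

s.merge(t, on="id", how="inner")   # same idea as: FROM s JOIN t ON s.id = t.id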
In a cross join, all possible combinations of rows appear in the output table, regardless of whether or not rows share a matching key. Because all rows are joined, even if there is no matching key, it is not necessary to specify what keys to consider in an ON
statement. A cross join is also known as a cartesian product.
Conceptually, we can interpret an inner join as a cross join, followed by removing all rows that do not share a matching key. Notice that the output of the inner join above contains all rows of the cross join example that contain a single color across the entire row.
+In a left outer join, all rows in the left table are kept in the output table. If a row in the right table shares a match with the left table, this row will be kept; otherwise, the rows in the right table are omitted from the output. We can fill in any missing values with NULL
.
A right outer join keeps all rows in the right table. Rows in the left table are only kept if they share a match in the right table. Again, we can fill in any missing values with NULL
.
In a full outer join, all rows that have a match between the two tables are joined together. If a row has no match in the second table, then the values of the columns for that second table are filled with NULL
. In other words, a full outer join performs an inner join while still keeping rows that have no match in the other table. This is best understood visually:
We have kept the same output achieved using an inner join, with the addition of partially null rows for entries in s
and t
that had no match in the second table.
JOIN
sWhen joining tables, we often create aliases for table names (similarly to what we did with column names in the last lecture). We do this as it is typically easier to refer to aliases, especially when we are working with long table names. We can even reference columns using aliased table names!
+Let’s say we want to determine the average rating of various movies. We’ll need to JOIN
the Title
and Rating
tables and can create aliases for both tables.
%%sql
SELECT primaryTitle, averageRating
FROM Title AS T INNER JOIN Rating AS R
ON T.tconst = R.tconst;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 2: FROM Title AS T INNER JOIN Rating AS R
+ ^
+[SQL: SELECT primaryTitle, averageRating
+FROM Title AS T INNER JOIN Rating AS R
+ON T.tconst = R.tconst;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
+Note that the AS
is actually optional! We can create aliases for our tables even without it, but we usually include it for clarity.
%%sql
SELECT primaryTitle, averageRating
FROM Title T INNER JOIN Rating R
ON T.tconst = R.tconst;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 2: FROM Title T INNER JOIN Rating R
+ ^
+[SQL: SELECT primaryTitle, averageRating
+FROM Title T INNER JOIN Rating R
+ON T.tconst = R.tconst;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
+For more sophisticated data problems, the queries can become very complex. Common table expressions (CTEs) allow us to break down these complex queries into more manageable parts. To do so, we create temporary tables corresponding to different aspects of the problem and then reference them in the final query:
+WITH
+table_name1 AS (
+ SELECT ...
+),
+table_name2 AS (
+ SELECT ...
+)
+SELECT ...
+FROM
+table_name1,
+table_name2, ...
+Let’s say we want to identify the top 10 action movies that are highly rated (with an average rating greater than 7) and popular (having more than 5000 votes), along with the primary actors who are the most popular. We can use CTEs to break this query down into separate problems. Initially, we can filter to find good action movies and prolific actors separately. This way, in our final join, we only need to change the order.
+%%sql
WITH
good_action_movies AS (
    SELECT *
    FROM Title T JOIN Rating R ON T.tconst = R.tconst
    WHERE genres LIKE '%Action%' AND averageRating > 7 AND numVotes > 5000
),
prolific_actors AS (
    SELECT N.nconst, primaryName, COUNT(*) as numRoles
    FROM Name N JOIN Principal P ON N.nconst = P.nconst
    WHERE category = 'actor'
    GROUP BY N.nconst, primaryName
)
SELECT primaryTitle, primaryName, numRoles, ROUND(averageRating) AS rating
FROM good_action_movies m, prolific_actors a, principal p
WHERE p.tconst = m.tconst AND p.nconst = a.nconst
ORDER BY rating DESC, numRoles DESC
LIMIT 10;
* duckdb:///data/basic_examples.db
+(duckdb.duckdb.CatalogException) Catalog Error: Table with name Title does not exist!
+Did you mean "system.information_schema.tables"?
+LINE 4: F...
+ ^
+[SQL: WITH
+good_action_movies AS (
+ SELECT *
+ FROM Title T JOIN Rating R ON T.tconst = R.tconst
+ WHERE genres LIKE '%Action%' AND averageRating > 7 AND numVotes > 5000
+),
+prolific_actors AS (
+ SELECT N.nconst, primaryName, COUNT(*) as numRoles
+ FROM Name N JOIN Principal P ON N.nconst = P.nconst
+ WHERE category = 'actor'
+ GROUP BY N.nconst, primaryName
+)
+SELECT primaryTitle, primaryName, numRoles, ROUND(averageRating) AS rating
+FROM good_action_movies m, prolific_actors a, principal p
+WHERE p.tconst = m.tconst AND p.nconst = a.nconst
+ORDER BY rating DESC, numRoles DESC
+LIMIT 10;]
+(Background on this error at: https://sqlalche.me/e/20/f405)
+In our journey of the data science lifecycle, we have begun to explore the vast world of exploratory data analysis. More recently, we learned how to pre-process data using various data manipulation techniques. As we work towards understanding our data, there is one key component missing in our arsenal — the ability to visualize and discern relationships in existing data.
+These next two lectures will introduce you to various examples of data visualizations and their underlying theory. In doing so, we’ll motivate their importance in real-world examples with the use of plotting libraries.
You've likely encountered several forms of data visualizations in your studies. You may remember a few such examples from Data 8: line plots, scatter plots, and histograms. Each of these served a unique purpose. For example, line plots displayed how numerical quantities changed over time, while histograms were useful in understanding a variable's distribution.
[Figure: example visualizations, including a line chart, a scatter plot, and a histogram.]
Visualizations are useful for a number of reasons. In Data 100, we consider two areas in particular:
+Altogether, these goals emphasize the fact that visualizations aren’t a matter of making “pretty” pictures; we need to do a lot of thinking about what stylistic choices communicate ideas most effectively.
+This course note will focus on the first half of visualization topics in Data 100. The goal here is to understand how to choose the “right” plot depending on different variable types and, secondly, how to generate these plots using code.
A distribution describes both the set of values that a single variable can take and the frequency of unique values in a single variable. For example, if we're interested in the distribution of students across Data 100 discussion sections, the set of possible values is a list of discussion sections (10-11am, 11-12pm, etc.), and the frequency that each of those values occurs is the number of students enrolled in each section. In other words, we're interested in how a variable is distributed across its possible values. Therefore, distributions must satisfy two properties:
Not a Valid Distribution: individuals can be associated with more than one category, and the bar values are in minutes rather than proportions, so this is not a valid distribution.
Valid Distribution: this example satisfies the two properties of distributions, so it is a valid distribution.
Different plots are more or less suited for displaying particular types of variables, laid out in the diagram below:
+The first step of any visualization is to identify the type(s) of variables we’re working with. From here, we can select an appropriate plot type:
+A bar plot is one of the most common ways of displaying the distribution of a qualitative (categorical) variable. The length of a bar plot encodes the frequency of a category; the width encodes no useful information. The color could indicate a sub-category, but this is not necessarily the case.
+Let’s contextualize this in an example. We will use the World Bank dataset (wb
) in our analysis.
import pandas as pd
+import numpy as np
+
wb = pd.read_csv("data/world_bank.csv", index_col=0)
wb.head()
+ | Continent | +Country | +Primary completion rate: Male: % of relevant age group: 2015 | +Primary completion rate: Female: % of relevant age group: 2015 | +Lower secondary completion rate: Male: % of relevant age group: 2015 | +Lower secondary completion rate: Female: % of relevant age group: 2015 | +Youth literacy rate: Male: % of ages 15-24: 2005-14 | +Youth literacy rate: Female: % of ages 15-24: 2005-14 | +Adult literacy rate: Male: % ages 15 and older: 2005-14 | +Adult literacy rate: Female: % ages 15 and older: 2005-14 | +... | +Access to improved sanitation facilities: % of population: 1990 | +Access to improved sanitation facilities: % of population: 2015 | +Child immunization rate: Measles: % of children ages 12-23 months: 2015 | +Child immunization rate: DTP3: % of children ages 12-23 months: 2015 | +Children with acute respiratory infection taken to health provider: % of children under age 5 with ARI: 2009-2016 | +Children with diarrhea who received oral rehydration and continuous feeding: % of children under age 5 with diarrhea: 2009-2016 | +Children sleeping under treated bed nets: % of children under age 5: 2009-2016 | +Children with fever receiving antimalarial drugs: % of children under age 5 with fever: 2009-2016 | +Tuberculosis: Treatment success rate: % of new cases: 2014 | +Tuberculosis: Cases detection rate: % of new estimated cases: 2015 | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +Africa | +Algeria | +106.0 | +105.0 | +68.0 | +85.0 | +96.0 | +92.0 | +83.0 | +68.0 | +... | +80.0 | +88.0 | +95.0 | +95.0 | +66.0 | +42.0 | +NaN | +NaN | +88.0 | +80.0 | +
1 | +Africa | +Angola | +NaN | +NaN | +NaN | +NaN | +79.0 | +67.0 | +82.0 | +60.0 | +... | +22.0 | +52.0 | +55.0 | +64.0 | +NaN | +NaN | +25.9 | +28.3 | +34.0 | +64.0 | +
2 | +Africa | +Benin | +83.0 | +73.0 | +50.0 | +37.0 | +55.0 | +31.0 | +41.0 | +18.0 | +... | +7.0 | +20.0 | +75.0 | +79.0 | +23.0 | +33.0 | +72.7 | +25.9 | +89.0 | +61.0 | +
3 | +Africa | +Botswana | +98.0 | +101.0 | +86.0 | +87.0 | +96.0 | +99.0 | +87.0 | +89.0 | +... | +39.0 | +63.0 | +97.0 | +95.0 | +NaN | +NaN | +NaN | +NaN | +77.0 | +62.0 | +
5 | +Africa | +Burundi | +58.0 | +66.0 | +35.0 | +30.0 | +90.0 | +88.0 | +89.0 | +85.0 | +... | +42.0 | +48.0 | +93.0 | +94.0 | +55.0 | +43.0 | +53.8 | +25.4 | +91.0 | +51.0 | +
5 rows × 47 columns
+We can visualize the distribution of the Continent
column using a bar plot. There are a few ways to do this.
wb['Continent'].value_counts().plot(kind='bar');
Recall that .value_counts()
returns a Series
with the total count of each unique value. We call .plot(kind='bar')
on this result to visualize these counts as a bar plot.
Plotting methods in pandas
are the least preferred and not supported in Data 100, as their functionality is limited. Instead, future examples will focus on other libraries built specifically for visualizing data. The most well-known library here is matplotlib
.
import matplotlib.pyplot as plt # matplotlib is typically given the alias plt

continent = wb['Continent'].value_counts()
plt.bar(continent.index, continent)
plt.xlabel('Continent')
plt.ylabel('Count');
While more code is required to achieve the same result, matplotlib
is often used over pandas
for its ability to plot more complex visualizations, some of which are discussed shortly.
However, note how we needed to label the axes with plt.xlabel
and plt.ylabel
, as matplotlib
does not support automatic axis labeling. To get around these inconveniences, we can use a more efficient plotting library: seaborn
.
Seaborn
import seaborn as sns # seaborn is typically given the alias sns
sns.countplot(data = wb, x = 'Continent');
In contrast to matplotlib
, the general structure of a seaborn
call involves passing in an entire DataFrame
, and then specifying what column(s) to plot. seaborn.countplot
both counts and visualizes the number of unique values in a given column. This column is specified by the x
argument to sns.countplot
, while the DataFrame
is specified by the data
argument.
For the vast majority of visualizations, seaborn
is far more concise and aesthetically pleasing than matplotlib
. However, the color scheme of this particular bar plot is arbitrary - it encodes no additional information about the categories themselves. This is not always true; color may signify meaningful detail in other visualizations. We’ll explore this more in-depth during the next lecture.
By now, you’ll have noticed that each of these plotting libraries have a very different syntax. As with pandas
, we’ll teach you the important methods in matplotlib
and seaborn
, but you’ll learn more through documentation.
Revisiting our example with the wb
DataFrame, let’s plot the distribution of Gross national income per capita
.
wb.head(5)
+ | Continent | +Country | +Primary completion rate: Male: % of relevant age group: 2015 | +Primary completion rate: Female: % of relevant age group: 2015 | +Lower secondary completion rate: Male: % of relevant age group: 2015 | +Lower secondary completion rate: Female: % of relevant age group: 2015 | +Youth literacy rate: Male: % of ages 15-24: 2005-14 | +Youth literacy rate: Female: % of ages 15-24: 2005-14 | +Adult literacy rate: Male: % ages 15 and older: 2005-14 | +Adult literacy rate: Female: % ages 15 and older: 2005-14 | +... | +Access to improved sanitation facilities: % of population: 1990 | +Access to improved sanitation facilities: % of population: 2015 | +Child immunization rate: Measles: % of children ages 12-23 months: 2015 | +Child immunization rate: DTP3: % of children ages 12-23 months: 2015 | +Children with acute respiratory infection taken to health provider: % of children under age 5 with ARI: 2009-2016 | +Children with diarrhea who received oral rehydration and continuous feeding: % of children under age 5 with diarrhea: 2009-2016 | +Children sleeping under treated bed nets: % of children under age 5: 2009-2016 | +Children with fever receiving antimalarial drugs: % of children under age 5 with fever: 2009-2016 | +Tuberculosis: Treatment success rate: % of new cases: 2014 | +Tuberculosis: Cases detection rate: % of new estimated cases: 2015 | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +Africa | +Algeria | +106.0 | +105.0 | +68.0 | +85.0 | +96.0 | +92.0 | +83.0 | +68.0 | +... | +80.0 | +88.0 | +95.0 | +95.0 | +66.0 | +42.0 | +NaN | +NaN | +88.0 | +80.0 | +
1 | +Africa | +Angola | +NaN | +NaN | +NaN | +NaN | +79.0 | +67.0 | +82.0 | +60.0 | +... | +22.0 | +52.0 | +55.0 | +64.0 | +NaN | +NaN | +25.9 | +28.3 | +34.0 | +64.0 | +
2 | +Africa | +Benin | +83.0 | +73.0 | +50.0 | +37.0 | +55.0 | +31.0 | +41.0 | +18.0 | +... | +7.0 | +20.0 | +75.0 | +79.0 | +23.0 | +33.0 | +72.7 | +25.9 | +89.0 | +61.0 | +
3 | +Africa | +Botswana | +98.0 | +101.0 | +86.0 | +87.0 | +96.0 | +99.0 | +87.0 | +89.0 | +... | +39.0 | +63.0 | +97.0 | +95.0 | +NaN | +NaN | +NaN | +NaN | +77.0 | +62.0 | +
5 | +Africa | +Burundi | +58.0 | +66.0 | +35.0 | +30.0 | +90.0 | +88.0 | +89.0 | +85.0 | +... | +42.0 | +48.0 | +93.0 | +94.0 | +55.0 | +43.0 | +53.8 | +25.4 | +91.0 | +51.0 | +
5 rows × 47 columns
+How should we define our categories for this variable? In the previous example, these were a few unique values of the Continent
column. If we use similar logic here, our categories are the different numerical values contained in the Gross national income per capita
column.
Under this assumption, let’s plot this distribution using the seaborn.countplot
function.
sns.countplot(data = wb, x = 'Gross national income per capita, Atlas method: $: 2016');
What happened? A bar plot (either plt.bar
or sns.countplot
) will create a separate bar for each unique value of a variable. With a continuous variable, we may not have a finite number of possible values, which can lead to situations like above where we would need many, many bars to display each unique value.
Specifically, we can say this plot suffers from overplotting as we are unable to interpret the plot and gain any meaningful insight.
+Rather than bar plots, to visualize the distribution of a continuous variable, we use one of the following types of plots:
+Box plots and violin plots are two very similar kinds of visualizations. Both display the distribution of a variable using information about quartiles.
+In a box plot, the width of the box at any point does not encode meaning. In a violin plot, the width of the plot indicates the density of the distribution at each possible value.
sns.boxplot(data=wb, y='Gross national income per capita, Atlas method: $: 2016');
=wb, y="Gross national income per capita, Atlas method: $: 2016"); sns.violinplot(data
A quartile represents a 25% portion of the data. We say that:
+This means that the middle 50% of the data lies between the first and third quartiles. This is demonstrated in the histogram below. The three quartiles are marked with red vertical bars.
gdp = wb['Gross domestic product: % growth : 2016']
gdp = gdp[~gdp.isna()]

q1, q2, q3 = np.percentile(gdp, [25, 50, 75])

wb_quartiles = wb.copy()
wb_quartiles['category'] = None
wb_quartiles.loc[(wb_quartiles['Gross domestic product: % growth : 2016'] < q1) | (wb_quartiles['Gross domestic product: % growth : 2016'] > q3), 'category'] = 'Outside of the middle 50%'
wb_quartiles.loc[(wb_quartiles['Gross domestic product: % growth : 2016'] > q1) & (wb_quartiles['Gross domestic product: % growth : 2016'] < q3), 'category'] = 'In the middle 50%'

sns.histplot(wb_quartiles, x="Gross domestic product: % growth : 2016", hue="category")
sns.rugplot([q1, q2, q3], c="firebrick", lw=6, height=0.1);
In a box plot, the lower extent of the box lies at Q1, while the upper extent of the box lies at Q3. The horizontal line in the middle of the box corresponds to Q2 (equivalently, the median).
sns.boxplot(data=wb, y='Gross domestic product: % growth : 2016');
The whiskers of a box-plot are the two points that lie at the [\(1^{st}\) Quartile \(-\) (\(1.5\times\) IQR)], and the [\(3^{rd}\) Quartile \(+\) (\(1.5\times\) IQR)]. They are the lower and upper ranges of “normal” data (the points excluding outliers).
+The different forms of information contained in a box plot can be summarised as follows:
+A violin plot displays quartile information, albeit a bit more subtly through smoothed density curves. Look closely at the center vertical bar of the violin plot below; the three quartiles and “whiskers” are still present!
sns.violinplot(data=wb, y='Gross domestic product: % growth : 2016');
Plotting side-by-side box or violin plots allows us to compare distributions across different categories. In other words, they enable us to plot both a qualitative variable and a quantitative continuous variable in one visualization.
+With seaborn
, we can easily create side-by-side plots by specifying both an x and y column.
=wb, x="Continent", y='Gross domestic product: % growth : 2016'); sns.boxplot(data
You are likely familiar with histograms from Data 8. A histogram collects continuous data into bins, then plots this binned data. Each bin reflects the density of datapoints with values that lie between the left and right ends of the bin; in other words, the area of each bin is proportional to the percentage of datapoints it contains.
+Below, we plot a histogram using matplotlib and seaborn. Which graph do you prefer?
+# The `edgecolor` argument controls the color of the bin edges
+= wb["Gross national income per capita, Atlas method: $: 2016"]
+ gni =True, edgecolor="white")
+ plt.hist(gni, density
+# Add labels
+"Gross national income per capita")
+ plt.xlabel("Density")
+ plt.ylabel("Distribution of gross national income per capita"); plt.title(
=wb, x="Gross national income per capita, Atlas method: $: 2016", stat="density")
+ sns.histplot(data"Distribution of gross national income per capita"); plt.title(
We can overlay histograms (or density curves) to compare distributions across qualitative categories.
+The hue
parameter of sns.histplot
specifies the column that should be used to determine the color of each category. hue
can be used in many seaborn
plotting functions.
Notice that the resulting plot includes a legend describing which color corresponds to each hemisphere – a legend should always be included if color is used to encode information in a visualization!
+# Create a new variable to store the hemisphere in which each country is located
+= ["Asia", "Europe", "N. America"]
+ north = ["Africa", "Oceania", "S. America"]
+ south "Continent"].isin(north), "Hemisphere"] = "Northern"
+ wb.loc[wb["Continent"].isin(south), "Hemisphere"] = "Southern" wb.loc[wb[
=wb, x="Gross national income per capita, Atlas method: $: 2016", hue="Hemisphere", stat="density")
+ sns.histplot(data"Distribution of gross national income per capita"); plt.title(
Again, each bin of a histogram is scaled such that its area is proportional to the percentage of all datapoints that it contains.
densities, bins, _ = plt.hist(gni, density=True, edgecolor="white", bins=5)
plt.xlabel("Gross national income per capita")
plt.ylabel("Density")

print(f"First bin has width {bins[1]-bins[0]} and height {densities[0]}")
print(f"This corresponds to {bins[1]-bins[0]} * {densities[0]} = {(bins[1]-bins[0])*densities[0]*100}% of the data")
First bin has width 16410.0 and height 4.7741589911386953e-05
+This corresponds to 16410.0 * 4.7741589911386953e-05 = 78.343949044586% of the data
+Histograms allow us to assess a distribution by their shape. There are a few properties of histograms we can analyze:
+The skew of a histogram describes the direction in which its “tail” extends. - A distribution with a long right tail is skewed right (such as Gross national income per capita
). In a right-skewed distribution, the few large outliers “pull” the mean to the right of the median.
sns.histplot(data = wb, x = 'Gross national income per capita, Atlas method: $: 2016', stat = 'density');
plt.title('Distribution with a long right tail')
Text(0.5, 1.0, 'Distribution with a long right tail')
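As a quick numeric check of this claim, we can compare the mean and median directly; when the mean sits well above the median, the long right tail is doing the pulling. A small sketch using the wb DataFrame loaded above:

gni = wb["Gross national income per capita, Atlas method: $: 2016"].dropna()
print(f"mean = {gni.mean():.1f}, median = {gni.median():.1f}")   # mean > median suggests a right skew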
A distribution with a long left tail is skewed left (such as Access to an improved water source
). In a left-skewed distribution, the few small outliers "pull" the mean to the left of the median. In the case where a distribution has equal-sized right and left tails, it is symmetric. The mean is approximately equal to the median. Think of the mean as the balancing point of the distribution.
sns.histplot(data = wb, x = 'Access to an improved water source: % of population: 2015', stat = 'density');
plt.title('Distribution with a long left tail')
Text(0.5, 1.0, 'Distribution with a long left tail')
+Loosely speaking, an outlier is defined as a data point that lies an abnormally large distance away from other values. Let’s make this more concrete. As you may have observed in the box plot infographic earlier, we define outliers to be the data points that fall beyond the whiskers. Specifically, values that are less than the [\(1^{st}\) Quartile \(-\) (\(1.5\times\) IQR)], or greater than [\(3^{rd}\) Quartile \(+\) (\(1.5\times\) IQR).]
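As a concrete sketch of this rule, the bounds can be computed directly from the quartiles. The snippet below uses the GDP growth column and the wb DataFrame loaded earlier in this note; it is only an illustration of the 1.5 x IQR cutoff, not part of the plotting examples.

import numpy as np

gdp = wb['Gross domestic product: % growth : 2016'].dropna()
q1, q3 = np.percentile(gdp, [25, 75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outliers = gdp[(gdp < lower) | (gdp > upper)]   # the points that fall beyond the whiskers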
+In Data 100, we describe a “mode” of a histogram as a peak in the distribution. Often, however, it is difficult to determine what counts as its own “peak.” For example, the number of peaks in the distribution of HIV rates across different countries varies depending on the number of histogram bins we plot.
+If we set the number of bins to 5, the distribution appears unimodal.
+# Rename the very long column name for convenience
wb = wb.rename(columns={'Antiretroviral therapy coverage: % of people living with HIV: 2015':"HIV rate"})

# With 5 bins, it seems that there is only one peak
sns.histplot(data=wb, x="HIV rate", stat="density", bins=5)
plt.title("5 histogram bins");
# With 10 bins, there seem to be two peaks
sns.histplot(data=wb, x="HIV rate", stat="density", bins=10)
plt.title("10 histogram bins");
# And with 20 bins, it becomes hard to say what counts as a "peak"!
sns.histplot(data=wb, x="HIV rate", stat="density", bins=20)
plt.title("20 histogram bins");
In part, it is these ambiguities that motivate us to consider using Kernel Density Estimation (KDE), which we will explore more in the next lecture.
Often, we want to identify general trends across a distribution, rather than focus on detail. Smoothing a distribution helps generalize the structure of the data and eliminate noise.
+A kernel density estimate (KDE) is a smooth, continuous function that approximates a curve. It allows us to represent general trends in a distribution without focusing on the details, which is useful for analyzing the broad structure of a dataset.
+More formally, a KDE attempts to approximate the underlying probability distribution from which our dataset was drawn. You may have encountered the idea of a probability distribution in your other classes; if not, we’ll discuss it at length in the next lecture. For now, you can think of a probability distribution as a description of how likely it is for us to sample a particular value in our dataset.
A KDE curve estimates the probability density function of a random variable. Consider the example below, where we have used `sns.displot` to plot both a histogram (containing the data points we actually collected) and a KDE curve (representing the approximated probability distribution from which this data was drawn) using data from the World Bank dataset (`wb`).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

wb = pd.read_csv("data/world_bank.csv", index_col=0)
wb = wb.rename(columns={'Antiretroviral therapy coverage: % of people living with HIV: 2015':"HIV rate",
                        'Gross national income per capita, Atlas method: $: 2016':'gni'})
wb.head()
+ | Continent | +Country | +Primary completion rate: Male: % of relevant age group: 2015 | +Primary completion rate: Female: % of relevant age group: 2015 | +Lower secondary completion rate: Male: % of relevant age group: 2015 | +Lower secondary completion rate: Female: % of relevant age group: 2015 | +Youth literacy rate: Male: % of ages 15-24: 2005-14 | +Youth literacy rate: Female: % of ages 15-24: 2005-14 | +Adult literacy rate: Male: % ages 15 and older: 2005-14 | +Adult literacy rate: Female: % ages 15 and older: 2005-14 | +... | +Access to improved sanitation facilities: % of population: 1990 | +Access to improved sanitation facilities: % of population: 2015 | +Child immunization rate: Measles: % of children ages 12-23 months: 2015 | +Child immunization rate: DTP3: % of children ages 12-23 months: 2015 | +Children with acute respiratory infection taken to health provider: % of children under age 5 with ARI: 2009-2016 | +Children with diarrhea who received oral rehydration and continuous feeding: % of children under age 5 with diarrhea: 2009-2016 | +Children sleeping under treated bed nets: % of children under age 5: 2009-2016 | +Children with fever receiving antimalarial drugs: % of children under age 5 with fever: 2009-2016 | +Tuberculosis: Treatment success rate: % of new cases: 2014 | +Tuberculosis: Cases detection rate: % of new estimated cases: 2015 | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +Africa | +Algeria | +106.0 | +105.0 | +68.0 | +85.0 | +96.0 | +92.0 | +83.0 | +68.0 | +... | +80.0 | +88.0 | +95.0 | +95.0 | +66.0 | +42.0 | +NaN | +NaN | +88.0 | +80.0 | +
1 | +Africa | +Angola | +NaN | +NaN | +NaN | +NaN | +79.0 | +67.0 | +82.0 | +60.0 | +... | +22.0 | +52.0 | +55.0 | +64.0 | +NaN | +NaN | +25.9 | +28.3 | +34.0 | +64.0 | +
2 | +Africa | +Benin | +83.0 | +73.0 | +50.0 | +37.0 | +55.0 | +31.0 | +41.0 | +18.0 | +... | +7.0 | +20.0 | +75.0 | +79.0 | +23.0 | +33.0 | +72.7 | +25.9 | +89.0 | +61.0 | +
3 | +Africa | +Botswana | +98.0 | +101.0 | +86.0 | +87.0 | +96.0 | +99.0 | +87.0 | +89.0 | +... | +39.0 | +63.0 | +97.0 | +95.0 | +NaN | +NaN | +NaN | +NaN | +77.0 | +62.0 | +
5 | +Africa | +Burundi | +58.0 | +66.0 | +35.0 | +30.0 | +90.0 | +88.0 | +89.0 | +85.0 | +... | +42.0 | +48.0 | +93.0 | +94.0 | +55.0 | +43.0 | +53.8 | +25.4 | +91.0 | +51.0 | +
5 rows × 47 columns
import seaborn as sns
import matplotlib.pyplot as plt

sns.displot(data = wb, x = 'HIV rate',
            kde = True, stat = "density")

plt.title("Distribution of HIV rates");
Notice that the smooth KDE curve is higher when the histogram bins are taller. You can think of the height of the KDE curve as representing how “probable” it is that we randomly sample a datapoint with the corresponding value. This intuitively makes sense – if we have already collected more datapoints with a particular value (resulting in a tall histogram bin), it is more likely that, if we randomly sample another datapoint, we will sample one with a similar value (resulting in a high KDE curve).
+The area under a probability density function should always integrate to 1, representing the fact that the total probability of a distribution should always sum to 100%. Hence, a KDE curve will always have an area under the curve of 1.
We perform kernel density estimation using three steps:

1. Place a kernel at each datapoint.
2. Normalize the kernels so that their total area is 1.
3. Sum the normalized kernels.

We'll explain what a "kernel" is momentarily.
+To make things simpler, let’s construct a KDE for a small, artificially generated dataset of 5 datapoints: \([2.2, 2.8, 3.7, 5.3, 5.7]\). In the plot below, each vertical bar represents one data point.
data = [2.2, 2.8, 3.7, 5.3, 5.7]

sns.rugplot(data, height=0.3)

plt.xlabel("Data")
plt.ylabel("Density")
plt.xlim(-3, 10)
plt.ylim(0, 0.5);
Our goal is to create the following KDE curve, which was generated automatically by `sns.kdeplot`.
sns.kdeplot(data)

plt.xlabel("Data")
plt.xlim(-3, 10)
plt.ylim(0, 0.5);
To begin generating a density curve, we need to choose a kernel and bandwidth value (\(\alpha\)). What are these exactly?
+A kernel is a density curve. It is the mathematical function that attempts to capture the randomness of each data point in our sampled data. To explain what this means, consider just one of the datapoints in our dataset: \(2.2\). We obtained this datapoint by randomly sampling some information out in the real world (you can imagine \(2.2\) as representing a single measurement taken in an experiment, for example). If we were to sample a new datapoint, we may obtain a slightly different value. It could be higher than \(2.2\); it could also be lower than \(2.2\). We make the assumption that any future sampled datapoints will likely be similar in value to the data we’ve already drawn. This means that our kernel – our description of the probability of randomly sampling any new value – will be greatest at the datapoint we’ve already drawn but still have non-zero probability above and below it. The area under any kernel should integrate to 1, representing the total probability of drawing a new datapoint.
A bandwidth value, usually denoted by \(\alpha\), represents the width of the kernel. A large value of \(\alpha\) will result in a wide, short kernel function, while a small value will result in a narrow, tall kernel.
+Below, we place a Gaussian kernel, plotted in orange, over the datapoint \(2.2\). A Gaussian kernel is simply the normal distribution, which you may have called a bell curve in Data 8.
def gaussian_kernel(x, z, a):
    # We'll discuss where this mathematical formulation came from later
    return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))

# Plot our datapoint
sns.rugplot([2.2], height=0.3)

# Plot the kernel
x = np.linspace(-3, 10, 1000)
plt.plot(x, gaussian_kernel(x, 2.2, 1))

plt.xlabel("Data")
plt.ylabel("Density")
plt.xlim(-3, 10)
plt.ylim(0, 0.5);
To begin creating our KDE, we place a kernel on each datapoint in our dataset. For our dataset of 5 points, we will have 5 kernels.
# You will work with the functions below in Lab 4
def create_kde(kernel, pts, a):
    # Takes in a kernel, set of points, and alpha
    # Returns the KDE as a function
    def f(x):
        output = 0
        for pt in pts:
            output += kernel(x, pt, a)
        return output / len(pts) # Normalization factor
    return f

def plot_kde(kernel, pts, a):
    # Calls create_kde and plots the corresponding KDE
    f = create_kde(kernel, pts, a)
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    y = [f(xi) for xi in x]
    plt.plot(x, y);

def plot_separate_kernels(kernel, pts, a, norm=False):
    # Plots individual kernels, which are then summed to create the KDE
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    for pt in pts:
        y = kernel(x, pt, a)
        if norm:
            y /= len(pts)
        plt.plot(x, y)

    plt.show()

plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.xlabel("Data")
plt.ylabel("Density")

plot_separate_kernels(gaussian_kernel, data, a = 1)
Above, we said that each kernel has an area of 1. Earlier, we also said that our goal is to construct a KDE curve using these kernels with a total area of 1. If we were to directly sum the kernels as they are, we would produce a KDE curve with an integrated area of (5 kernels) \(\times\) (area of 1 each) = 5. To avoid this, we will normalize each of our kernels. This involves multiplying each kernel by \(\frac{1}{\#\:\text{datapoints}}\).
+In the cell below, we multiply each of our 5 kernels by \(\frac{1}{5}\) to apply normalization.
+-3, 10)
+ plt.xlim(0, 0.5)
+ plt.ylim("Data")
+ plt.xlabel("Density")
+ plt.ylabel(
+# The `norm` argument specifies whether or not to normalize the kernels
+= 1, norm = True) plot_separate_kernels(gaussian_kernel, data, a
Our KDE curve is the sum of the normalized kernels. Notice that the final curve is identical to the plot generated by `sns.kdeplot` we saw earlier!
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.xlabel("Data")
plt.ylabel("Density")

plot_kde(gaussian_kernel, data, a = 1)
A general "KDE formula" function is given above. Written mathematically, the KDE evaluated at an input \(x\) is the normalized sum of a kernel \(K_a\) centered at each observed datapoint \(x_i\):

\[f_a(x) = \frac{1}{n}\sum_{i=1}^{n}K_a(x, x_i)\]

Here, \(x\) is any value on the number line, \(x_1, x_2, \dots, x_n\) are the \(n\) observed datapoints, and the factor of \(\frac{1}{n}\) ensures that the total area under the KDE remains 1.
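As a quick sanity check (a sketch reusing the `create_kde`, `gaussian_kernel`, and `data` objects defined above), evaluating this formula by hand matches the function returned by `create_kde`, and the resulting KDE integrates to approximately 1:

```python
import numpy as np

f = create_kde(gaussian_kernel, data, 1)

# Evaluate (1/n) * sum of kernels manually at one input value
manual = sum(gaussian_kernel(3.0, pt, 1) for pt in data) / len(data)
print(np.isclose(f(3.0), manual))   # True

# Numerically integrate the KDE curve; the area should be close to 1
xs = np.linspace(-20, 30, 10_000)
print(np.trapz(f(xs), xs))          # approximately 1.0
```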
A kernel (for our purposes) is a valid density function. This means it:

- must be non-negative for all inputs, and
- must integrate to 1.
The most common kernel is the Gaussian kernel. The Gaussian kernel is equivalent to the Gaussian probability density function (the Normal distribution), centered at the observed value \(x_i\) with a standard deviation of \(\alpha\) (this is known as the bandwidth parameter).
+\[K_a(x, x_i) = \frac{1}{\sqrt{2\pi\alpha^{2}}}e^{-\frac{(x-x_i)^{2}}{2\alpha^{2}}}\]
In this formula:

- \(x\) is the input value at which we evaluate the kernel,
- \(x_i\) is the observed datapoint on which the kernel is centered, and
- \(\alpha\) is the bandwidth parameter.
+The details of this (admittedly intimidating) formula are less important than understanding its role in kernel density estimation – this equation gives us the shape of each kernel.
(Figures: the Gaussian kernel plotted with bandwidths \(\alpha\) = 0.1, 1, 2, and 10. Smaller bandwidths produce narrow, tall kernels; larger bandwidths produce wide, short kernels.)
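To recreate plots like these yourself, a short sketch (reusing the `gaussian_kernel` function defined earlier) overlays the same kernel, centered at the datapoint \(2.2\), for several bandwidth values:

```python
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-3, 10, 1000)
for a in [0.1, 1, 2, 10]:
    # One Gaussian kernel centered at 2.2 for each bandwidth value
    plt.plot(x, gaussian_kernel(x, 2.2, a), label=f"alpha = {a}")

plt.xlabel("x")
plt.ylabel("Density")
plt.legend();
```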
Another example of a kernel is the boxcar kernel. The boxcar kernel assigns a uniform density to points within a "window" around the observation, and a density of 0 elsewhere. The equation below is a boxcar kernel centered at \(x_i\) with a bandwidth of \(\alpha\).
+\[K_a(x, x_i) = \begin{cases} + \frac{1}{\alpha}, & |x - x_i| \le \frac{\alpha}{2}\\ + 0, & \text{else } + \end{cases}\]
+The boxcar kernel is seldom used in practice – we include it here to demonstrate that a kernel function can take whatever form you would like, provided it integrates to 1 and does not output negative values.
def boxcar_kernel(alpha, x, z):
    return (((x-z)>=-alpha/2)&((x-z)<=alpha/2))/alpha

xs = np.linspace(-5, 5, 200)
alpha = 1
kde_curve = [boxcar_kernel(alpha, x, 0) for x in xs]
plt.plot(xs, kde_curve);
The righthand diagram below shows how the density curve for our 5-point dataset would have looked had we used the boxcar kernel with bandwidth \(\alpha = 1\).
(Figures: the KDE of the 5-point dataset built with the Gaussian kernel (left, labeled "KDE") and with the boxcar kernel (right, labeled "Boxcar"), both with \(\alpha = 1\).)
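To reproduce a comparison like this, here is a sketch that builds both KDEs with the `create_kde` helper from earlier. Note that `boxcar_kernel` above takes its arguments in a different order than `gaussian_kernel`, so we wrap it first; the wrapper name is just for illustration.

```python
import numpy as np
import matplotlib.pyplot as plt

def boxcar_as_kernel(x, z, a):
    # Adapter so the boxcar kernel matches the (x, z, a) signature expected by create_kde
    return boxcar_kernel(a, x, z)

x = np.linspace(-3, 10, 1000)
plt.plot(x, create_kde(gaussian_kernel, data, 1)(x), label="Gaussian kernel")
plt.plot(x, create_kde(boxcar_as_kernel, data, 1)(x), label="Boxcar kernel")

plt.xlabel("Data")
plt.ylabel("Density")
plt.legend();
```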
displot
As we saw earlier, we can use `seaborn`'s `displot` function to plot various distributions. In particular, `displot` allows you to specify the `kind` of plot and is a wrapper for `histplot`, `kdeplot`, and `ecdfplot`.

Below, we can see a couple of examples of how `sns.displot` can be used to plot various distributions.

First, we can plot a histogram by setting `kind` to `"hist"`. Note that here we've specified `stat="density"` to normalize the histogram such that the area under the histogram is equal to 1.
sns.displot(data=wb,
            x="gni",
            kind="hist",
            stat="density") # default: stat=count and density integrates to 1

plt.title("Distribution of gross national income per capita");
Now, what if we want to generate a KDE plot? We can set `kind` to `"kde"`!
sns.displot(data=wb,
            x="gni",
            kind='kde')

plt.title("Distribution of gross national income per capita");
And finally, if we want to generate an Empirical Cumulative Distribution Function (ECDF), we can specify `kind = "ecdf"`.
sns.displot(data=wb,
            x="gni",
            kind='ecdf')

plt.title("Cumulative Distribution of gross national income per capita");
Up until now, we’ve discussed how to visualize single-variable distributions. Going beyond this, we want to understand the relationship between pairs of numerical variables.
+Scatter plots are one of the most useful tools in representing the relationship between pairs of quantitative variables. They are particularly important in gauging the strength, or correlation, of the relationship between variables. Knowledge of these relationships can then motivate decisions in our modeling process.
In `matplotlib`, we use the function `plt.scatter` to generate a scatter plot. Notice that, unlike our examples of plotting single-variable distributions, now we specify sequences of values to be plotted along the x-axis and the y-axis.
"per capita: % growth: 2016"], \
+ plt.scatter(wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'])
+ wb[
+"% growth per capita")
+ plt.xlabel("Female adult literacy rate")
+ plt.ylabel("Female adult literacy against % growth"); plt.title(
In `seaborn`, we call the function `sns.scatterplot`. We use the `x` and `y` parameters to indicate the values to be plotted along the x and y axes, respectively. By using the `hue` parameter, we can specify a third variable to be used for coloring each scatter point.
= wb, x = "per capita: % growth: 2016", \
+ sns.scatterplot(data = "Adult literacy rate: Female: % ages 15 and older: 2005-14",
+ y = "Continent")
+ hue
+"Female adult literacy against % growth"); plt.title(
Although the plots above communicate the general relationship between the two plotted variables, they both suffer a major limitation – overplotting. Overplotting occurs when scatter points with similar values are stacked on top of one another, making it difficult to see the number of scatter points actually plotted in the visualization. Notice how in the upper righthand region of the plots, we cannot easily tell just how many points have been plotted. This makes our visualizations difficult to interpret.
+We have a few methods to help reduce overplotting:
- Decreasing the size of the scatter point markers to improve readability. We can do this by setting a new value for the size parameter, `s`, of `plt.scatter` or `sns.scatterplot`.
- Jittering: adding a small amount of random noise to the x and y values of all datapoints so that overlapping points are shifted slightly apart.

In the cell below, we first jitter the data using `np.random.uniform`, then re-plot it with smaller markers. The resulting plot is much easier to interpret.
# Setting a seed ensures that we produce the same plot each time
# This means that the course notes will not change each time you access them
np.random.seed(150)

# This call to np.random.uniform generates random numbers between -1 and 1
# We add these random numbers to the original x data to jitter it slightly
x_noise = np.random.uniform(-1, 1, len(wb))
jittered_x = wb["per capita: % growth: 2016"] + x_noise

# Repeat for y data
y_noise = np.random.uniform(-5, 5, len(wb))
jittered_y = wb["Adult literacy rate: Female: % ages 15 and older: 2005-14"] + y_noise

# Setting the size parameter `s` changes the size of each point
plt.scatter(jittered_x, jittered_y, s=15)

plt.xlabel("% growth per capita (jittered)")
plt.ylabel("Female adult literacy rate (jittered)")
plt.title("Female adult literacy against % growth");
`lmplot` and `jointplot`

`seaborn` also includes several built-in functions for creating more sophisticated scatter plots. Two of the most commonly used examples are `sns.lmplot` and `sns.jointplot`.

`sns.lmplot` plots both a scatter plot and a linear regression line, all in one function call. We'll discuss linear regression in a few lectures.
= wb, x = "per capita: % growth: 2016", \
+ sns.lmplot(data = "Adult literacy rate: Female: % ages 15 and older: 2005-14")
+ y
+"Female adult literacy against % growth"); plt.title(
`sns.jointplot` creates a visualization with three components: a scatter plot, a histogram of the distribution of x values, and a histogram of the distribution of y values.
= wb, x = "per capita: % growth: 2016", \
+ sns.jointplot(data = "Adult literacy rate: Female: % ages 15 and older: 2005-14")
+ y
+# plt.suptitle allows us to shift the title up so it does not overlap with the histogram
+"Female adult literacy against % growth")
+ plt.suptitle(=0.9); plt.subplots_adjust(top
For datasets with a very large number of datapoints, jittering is unlikely to fully resolve the issue of overplotting. In these cases, we can attempt to visualize our data by its density, rather than displaying each individual datapoint.
+Hex plots can be thought of as two-dimensional histograms that show the joint distribution between two variables. This is particularly useful when working with very dense data. In a hex plot, the x-y plane is binned into hexagons. Hexagons that are darker in color indicate a greater density of data – that is, there are more data points that lie in the region enclosed by the hexagon.
We can generate a hex plot using `sns.jointplot` modified with the `kind` parameter.
= wb, x = "per capita: % growth: 2016", \
+ sns.jointplot(data = "Adult literacy rate: Female: % ages 15 and older: 2005-14", \
+ y = "hex")
+ kind
+# plt.suptitle allows us to shift the title up so it does not overlap with the histogram
+"Female adult literacy against % growth")
+ plt.suptitle(=0.9); plt.subplots_adjust(top
Contour plots are an alternative way of plotting the joint distribution of two variables. You can think of them as the 2-dimensional versions of KDE plots. A contour plot can be interpreted in a similar way to a topographic map. Each contour line represents an area that has the same density of datapoints throughout the region. Contours marked with darker colors contain more datapoints (a higher density) in that region.
`sns.kdeplot` will generate a contour plot if we specify both x and y data.
= wb, x = "per capita: % growth: 2016", \
+ sns.kdeplot(data = "Adult literacy rate: Female: % ages 15 and older: 2005-14", \
+ y = True)
+ fill
+"Female adult literacy against % growth"); plt.title(
We have now covered visualizations in great depth, looking into various forms of visualizations, plotting libraries, and high-level theory.
+Much of this was done to uncover insights in data, which will prove necessary when we begin building models of data later in the course. A strong graphical correlation between two variables hints at an underlying relationship that we may want to study in greater detail. However, relying on visual relationships alone is limiting - not all plots show association. The presence of outliers and other statistical anomalies makes it hard to interpret data.
+Transformations are the process of manipulating data to find significant relationships between variables. These are often found by applying mathematical functions to variables that “transform” their range of possible values and highlight some previously hidden associations between data.
+To see why we may want to transform data, consider the following plot of adult literacy rates against gross national income.
# Some data cleaning to help with the next example
df = pd.DataFrame(index=wb.index)
df['lit'] = wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'] \
            + wb["Adult literacy rate: Male: % ages 15 and older: 2005-14"]
df['inc'] = wb['gni']
df.dropna(inplace=True)

plt.scatter(df["inc"], df["lit"])
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate")
plt.title("Adult literacy rate against GNI per capita");
This plot is difficult to interpret for two reasons:

- Most of the datapoints are concentrated in the upper lefthand region of the plot: the large literacy values (y) are clumped together, compressing the vertical axis.
- The few countries with very large gross national incomes (x) stretch out the horizontal axis, squishing the remaining data to the left.
+A transformation would allow us to visualize this data more clearly, which, in turn, would enable us to describe the underlying relationship between our variables of interest.
+We will most commonly apply a transformation to linearize a relationship between variables. If we find a transformation to make a scatter plot of two variables linear, we can “backtrack” to find the exact relationship between the variables. This helps us in two major ways. Firstly, linear relationships are particularly simple to interpret – we have an intuitive sense of what the slope and intercept of a linear trend represent, and how they can help us understand the relationship between two variables. Secondly, linear relationships are the backbone of linear models. We will begin exploring linear modeling in great detail next week. As we’ll soon see, linear models become much more effective when we are working with linearized data.
+In the remainder of this note, we will discuss how to linearize a dataset to produce the result below. Notice that the resulting plot displays a rough linear relationship between the values plotted on the x and y axes.
+To linearize a relationship, begin by asking yourself: what makes the data non-linear? It is helpful to repeat this question for each variable in your visualization.
Let's start by considering the gross national income variable, plotted on the horizontal axis. The scale of this axis is being distorted by the few large outlying x values on the right, while the bulk of the datapoints are clumped together at much smaller incomes.
If we decreased the size of these outliers relative to the bulk of the data, we could reduce the distortion of the horizontal axis. How can we do this? We need a transformation that will:

- decrease the magnitude of large x values by a significant amount, and
- leave the magnitude of small x values relatively unchanged.
+One function that produces this result is the log transformation. When we take the logarithm of a large number, the original number will decrease in magnitude dramatically. Conversely, when we take the logarithm of a small number, the original number does not change its value by as significant of an amount (to illustrate this, consider the difference between \(\log{(100)} = 4.61\) and \(\log{(10)} = 2.3\)).
+In Data 100 (and most upper-division STEM classes), \(\log\) is used to refer to the natural logarithm with base \(e\).
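A quick check of this convention in `numpy`, where `np.log` is the natural logarithm and `np.log10` is the base-10 logarithm:

```python
import numpy as np

print(np.log(np.e))    # 1.0, since log here means the natural logarithm
print(np.log(100))     # roughly 4.61
print(np.log10(100))   # 2.0
```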
# np.log takes the logarithm of an array or Series
plt.scatter(np.log(df["inc"]), df["lit"])

plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate")
plt.title("Adult literacy rate against Log(GNI per capita)");
After taking the logarithm of our x values, our plot appears much more balanced in its horizontal scale. We no longer have many datapoints clumped on one end and a few outliers out at extreme values.
+Let’s repeat this reasoning for the y values. Considering only the vertical axis of the plot, notice how there are many datapoints concentrated at large y values. Only a few datapoints lie at smaller values of y.
If we were to "spread out" these large values of y more, we would no longer see the dense concentration in one region of the y-axis. We need a transformation that will:

- increase the magnitude of large values of y so that they are more spread out, and
- leave the magnitude of small values of y relatively unchanged.
+In this case, it is helpful to apply a power transformation – that is, raise our y values to a power. Let’s try raising our adult literacy rate values to the power of 4. Large values raised to the power of 4 will increase in magnitude proportionally much more than small values raised to the power of 4 (consider the difference between \(2^4 = 16\) and \(200^4 = 1600000000\)).
# Apply a log transformation to the x values and a power transformation to the y values
plt.scatter(np.log(df["inc"]), df["lit"]**4)

plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)")
plt.suptitle("Adult literacy rate (4th power) against Log(GNI per capita)")
plt.subplots_adjust(top=0.9);
Our scatter plot is looking a lot better! Now, we are plotting the log of our original x values on the horizontal axis, and the 4th power of our original y values on the vertical axis. We start to see an approximate linear relationship between our transformed variables.
+What can we take away from this? We now know that the log of gross national income and adult literacy to the power of 4 are roughly linearly related. If we denote the original, untransformed gross national income values as \(x\) and the original adult literacy rate values as \(y\), we can use the standard form of a linear fit to express this relationship:
+\[y^4 = m(\log{x}) + b\]
where \(m\) represents the slope of the linear fit and \(b\) represents the intercept.
+The cell below computes \(m\) and \(b\) for our transformed data. We’ll discuss how this code was generated in a future lecture.
# The code below fits a linear regression model. We'll discuss it at length in a future lecture
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(np.log(df[["inc"]]), df["lit"]**4)
m, b = model.coef_[0], model.intercept_

print(f"The slope, m, of the transformed data is: {m}")
print(f"The intercept, b, of the transformed data is: {b}")

df = df.sort_values("inc")
plt.scatter(np.log(df["inc"]), df["lit"]**4, label="Transformed data")
plt.plot(np.log(df["inc"]), m*np.log(df["inc"])+b, c="red", label="Linear regression")
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)")
plt.legend();

The slope, m, of the transformed data is: 336400693.43172705
The intercept, b, of the transformed data is: -1802204836.0479987
+What if we want to understand the underlying relationship between our original variables, before they were transformed? We can simply rearrange our linear expression above!
+Recall our linear relationship between the transformed variables \(\log{x}\) and \(y^4\).
+\[y^4 = m(\log{x}) + b\]
+By rearranging the equation, we find a relationship between the untransformed variables \(x\) and \(y\).
+\[y = [m(\log{x}) + b]^{(1/4)}\]
+When we plug in the values for \(m\) and \(b\) computed above, something interesting happens.
# Now, plug the values for m and b into the relationship between the untransformed x and y
plt.scatter(df["inc"], df["lit"], label="Untransformed data")
plt.plot(df["inc"], (m*np.log(df["inc"])+b)**(1/4), c="red", label="Modeled relationship")

plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate")
plt.legend();
We have found a relationship between our original variables – gross national income and adult literacy rate!
Transformations are powerful tools for understanding our data in greater detail. To summarize what we just achieved:

- We identified transformations (the log of x and the 4th power of y) that linearize the original data.
- We fit a line to the transformed data to obtain a slope \(m\) and intercept \(b\).
- We "backtracked" using this slope and intercept to describe the relationship between the original, untransformed variables.
+Linearization will be an important tool as we begin our work on linear modeling next week.
+The Tukey-Mosteller Bulge Diagram is a good guide when determining possible transformations to achieve linearity. It is a visual summary of the reasoning we just worked through above.
+How does it work? Each curved “bulge” represents a possible shape of non-linear data. To use the diagram, find which of the four bulges resembles your dataset the most closely. Then, look at the axes of the quadrant for this bulge. The horizontal axis will list possible transformations that could be applied to your x data for linearization. Similarly, the vertical axis will list possible transformations that could be applied to your y data. Note that each axis lists two possible transformations. While either of these transformations has the potential to linearize your dataset, note that this is an iterative process. It’s important to try out these transformations and look at the results to see whether you’ve actually achieved linearity. If not, you’ll need to continue testing other possible transformations.
Generally:

- \(\sqrt{x}\) and \(\log{x}\) will reduce the magnitude of large values, pulling them closer to the rest of the data.
- Powers (\(x^2\) and \(x^3\)) will increase the magnitude of large values, spreading them further apart. (The same logic applies to transformations of \(y\).)
+Important: You should still understand the logic we worked through to determine how best to transform the data. The bulge diagram is just a summary of this same reasoning. You will be expected to be able to explain why a given transformation is or is not appropriate for linearization.
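In that spirit, here is a small sketch (reusing the `df` constructed above; the particular candidates are just for illustration) that tries a few transformations suggested by the bulge diagram side by side, so we can judge by eye which one comes closest to a linear pattern:

```python
import numpy as np
import matplotlib.pyplot as plt

# Candidate transformations to compare visually
candidates = {
    "log(x) vs y":   (np.log(df["inc"]), df["lit"]),
    "log(x) vs y^2": (np.log(df["inc"]), df["lit"]**2),
    "log(x) vs y^4": (np.log(df["inc"]), df["lit"]**4),
}

fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for ax, (title, (x, y)) in zip(axes, candidates.items()):
    ax.scatter(x, y, s=10)
    ax.set_title(title)
plt.tight_layout()
```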
+Visualization requires a lot of thought!
This class primarily uses `seaborn` and `matplotlib`, but `pandas` also has basic built-in plotting methods. Many other visualization libraries exist, and `plotly` is one of them.

- `plotly` can create interactive plots very easily.
- `plotly` will occasionally appear in lecture code, labs, and assignments!
will occasionally appear in lecture code, labs, and assignments!Next, we’ll go deeper into the theory behind visualization.
+This section marks a pivot to the second major topic of this lecture - visualization theory. We’ll discuss the abstract nature of visualizations and analyze how they convey information.
Remember, we had two goals for visualizing data. This section is particularly important in:

- helping us understand our data and results, and
- communicating our results and conclusions to others.
+Visualizations are able to convey information through various encodings. In the remainder of this lecture, we’ll look at the use of color, scale, and depth, to name a few.
+One detail that we may have overlooked in our earlier discussion of rugplots is the importance of encodings. Rugplots are effective visuals because they utilize line thickness to encode frequency. Consider the following diagram:
+Encodings are also useful for representing multi-dimensional data. Notice how the following visual highlights four distinct “dimensions” of data:
The human visual system can only directly perceive three spatial dimensions, but as you've seen, we can encode many more channels of information.
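For instance, a single `sns.scatterplot` call can encode four variables at once (a sketch using the `wb` DataFrame from earlier; the choice of columns here is just for illustration):

```python
import seaborn as sns
import matplotlib.pyplot as plt

# x-position, y-position, color (hue), and marker size each encode a different variable
sns.scatterplot(data=wb,
                x="per capita: % growth: 2016",
                y="Adult literacy rate: Female: % ages 15 and older: 2005-14",
                hue="Continent",
                size="gni")
plt.title("Four encoded dimensions: x, y, color, and size");
```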
+However, we should be careful to not misrepresent relationships in our data by manipulating the scale or axes. The visualization below improperly portrays two seemingly independent relationships on the same plot. The authors have clearly changed the scale of the y-axis to mislead their audience.
+Notice how the downwards-facing line segment contains values in the millions, while the upwards-trending segment only contains values near three hundred thousand. These lines should not be intersecting.
+When there is a large difference in the magnitude of the data, it’s advised to analyze percentages instead of counts. The following diagrams correctly display the trends in cancer screening and abortion rates.
+Great visualizations not only consider the scale of the data but also utilize the axes in a way that best conveys information. For example, data scientists commonly set certain axes limits to highlight parts of the visualization they are most interested in.
The visualization on the right captures the trend in coronavirus cases during March of 2020. From only looking at the visualization on the left, a viewer may incorrectly believe that coronavirus began to skyrocket on March 4th, 2020. However, the second illustration tells a different story - cases rose closer to March 21st, 2020.
+Color is another important feature in visualizations that does more than what meets the eye.
+We already explored using color to encode a categorical variable in our scatter plot. Let’s now discuss the uses of color in novel visualizations like colormaps and heatmaps.
Roughly 5-8% of the world is red-green color blind, so we have to be very particular about our color scheme. We want to make our visualizations as accessible as possible. Choosing a set of colors that work together is evidently a challenging task!
+Colormaps are mappings from pixel data to color values, and they’re often used to highlight distinct parts of an image. Let’s investigate a few properties of colormaps.
+Jet Colormap
Viridis Colormap
The jet colormap is infamous for being misleading. While it seems more vibrant than viridis, the aggressive colors poorly encode numerical data. To understand why, let’s analyze the following images.
+The diagram on the left compares how a variety of colormaps represent pixel data that transitions from a high to low intensity. These include the jet colormap (row a) and grayscale (row b). Notice how the grayscale images do the best job in smoothly transitioning between pixel data. The jet colormap is the worst at this - the four images in row (a) look like a conglomeration of individual colors.
+The difference is also evident in the images labeled (a) and (b) on the left side. The grayscale image is better at preserving finer detail in the vertical line strokes. Additionally, grayscale is preferred in X-ray scans for being more neutral. The intensity of the dark red color in the jet colormap is frightening and indicates something is wrong.
Why is the jet colormap so much worse? The answer lies in how its color composition is perceived by the human eye.
+Jet Colormap Perception
Viridis Colormap Perception
The jet colormap is largely misleading because it is not perceptually uniform. Perceptually uniform colormaps have the property that if the pixel data goes from 0.1 to 0.2, the perceptual change is the same as when the data goes from 0.8 to 0.9.
Notice how this uniformity shows up as the linear trend in the viridis colormap's perceptual plot. The jet colormap, on the other hand, is largely non-linear - this is precisely why it's considered a worse colormap.
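One way to see this for yourself is to render the same smooth gradient with both colormaps (a minimal sketch; the synthetic gradient stands in for real pixel data):

```python
import numpy as np
import matplotlib.pyplot as plt

# A smooth ramp from 0 to 1, repeated across rows so it renders as a horizontal gradient
gradient = np.tile(np.linspace(0, 1, 256), (20, 1))

fig, axes = plt.subplots(2, 1, figsize=(6, 2.5))
axes[0].imshow(gradient, cmap="jet", aspect="auto")
axes[0].set_title("jet")
axes[1].imshow(gradient, cmap="viridis", aspect="auto")
axes[1].set_title("viridis")
for ax in axes:
    ax.set_xticks([])
    ax.set_yticks([])
plt.tight_layout()
```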
+In our earlier discussion of multi-dimensional encodings, we analyzed a scatter plot with four pseudo-dimensions: the two axes, area, and color. Were these appropriate to use? The following diagram analyzes how well the human eye can distinguish between these “markings”.
There are a few key takeaways from this diagram: markings based on position and length are the easiest for the human eye to compare precisely, while markings based on area, volume, or color are much harder to judge accurately and should be used sparingly for quantitative comparisons.
+Conditioning is the process of comparing data that belong to separate groups. We’ve seen this before in overlayed distributions, side-by-side box plots, and scatter plots with categorical encodings. Here, we’ll introduce terminology that formalizes these examples.
+Consider an example where we want to analyze income earnings for males and females with varying levels of education. There are multiple ways to compare this data.
+The barplot is an example of juxtaposition: placing multiple plots side by side, with the same scale. The scatter plot is an example of superposition: placing multiple density curves and scatter plots on top of each other.
+Which is better depends on the problem at hand. Here, superposition makes the precise wage difference very clear from a quick glance. However, many sophisticated plots convey information that favors the use of juxtaposition. Below is one example.
The last component of a great visualization is perhaps the most critical - the use of context. Adding informative titles, axis labels, and descriptive captions is a best practice that we've heard repeatedly in Data 8.
A publication-ready plot (and every Data 100 plot) needs:

- an informative title,
- labeled axes (with units where relevant),
- a legend, if color or style is used to encode information, and
- a descriptive caption.

Captions should:

- be self-contained,
- describe what has been plotted, and
- draw attention to the most important features and conclusions.